Diffstat (limited to 'src/rocksdb/table')
123 files changed, 43963 insertions, 0 deletions
diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.cc b/src/rocksdb/table/adaptive/adaptive_table_factory.cc new file mode 100644 index 000000000..5b9fe3dbd --- /dev/null +++ b/src/rocksdb/table/adaptive/adaptive_table_factory.cc @@ -0,0 +1,126 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "table/adaptive/adaptive_table_factory.h" + +#include "port/port.h" +#include "table/format.h" +#include "table/table_builder.h" + +namespace ROCKSDB_NAMESPACE { + +AdaptiveTableFactory::AdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory) + : table_factory_to_write_(table_factory_to_write), + block_based_table_factory_(block_based_table_factory), + plain_table_factory_(plain_table_factory), + cuckoo_table_factory_(cuckoo_table_factory) { + if (!plain_table_factory_) { + plain_table_factory_.reset(NewPlainTableFactory()); + } + if (!block_based_table_factory_) { + block_based_table_factory_.reset(NewBlockBasedTableFactory()); + } + if (!cuckoo_table_factory_) { + cuckoo_table_factory_.reset(NewCuckooTableFactory()); + } + if (!table_factory_to_write_) { + table_factory_to_write_ = block_based_table_factory_; + } +} + +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kCuckooTableMagicNumber; + +Status AdaptiveTableFactory::NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache) const { + Footer footer; + IOOptions opts; + auto s = ReadFooterFromFile(opts, file.get(), nullptr /* prefetch_buffer */, + file_size, &footer); + if (!s.ok()) { + return s; + } + if (footer.table_magic_number() == kPlainTableMagicNumber || + footer.table_magic_number() == kLegacyPlainTableMagicNumber) { + return plain_table_factory_->NewTableReader( + table_reader_options, std::move(file), file_size, table); + } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || + footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { + return block_based_table_factory_->NewTableReader( + ro, table_reader_options, std::move(file), file_size, table, + prefetch_index_and_filter_in_cache); + } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { + return cuckoo_table_factory_->NewTableReader( + table_reader_options, std::move(file), file_size, table); + } else { + return Status::NotSupported("Unidentified table format"); + } +} + +TableBuilder* AdaptiveTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + return table_factory_to_write_->NewTableBuilder(table_builder_options, file); +} + +std::string AdaptiveTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + if (table_factory_to_write_) { + 
snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", + (table_factory_to_write_->Name() ? table_factory_to_write_->Name() + : ""), + table_factory_to_write_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (plain_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + plain_table_factory_->Name() ? plain_table_factory_->Name() : "", + plain_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (block_based_table_factory_) { + snprintf( + buffer, kBufferSize, " %s options:\n%s\n", + (block_based_table_factory_->Name() ? block_based_table_factory_->Name() + : ""), + block_based_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + if (cuckoo_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + cuckoo_table_factory_->Name() ? cuckoo_table_factory_->Name() : "", + cuckoo_table_factory_->GetPrintableOptions().c_str()); + ret.append(buffer); + } + return ret; +} + +extern TableFactory* NewAdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory) { + return new AdaptiveTableFactory(table_factory_to_write, + block_based_table_factory, + plain_table_factory, cuckoo_table_factory); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.h b/src/rocksdb/table/adaptive/adaptive_table_factory.h new file mode 100644 index 000000000..3b631942d --- /dev/null +++ b/src/rocksdb/table/adaptive/adaptive_table_factory.h @@ -0,0 +1,58 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
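// Illustrative usage sketch (not part of the diff): how the factory defined in
// adaptive_table_factory.cc above is typically wired into DB options. This
// assumes the public NewAdaptiveTableFactory() declaration in rocksdb/table.h,
// whose factory arguments default to nullptr so the constructor above falls
// back to the built-in block-based, plain, and cuckoo factories.
#include <memory>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Status OpenDbThatReadsAnyTableFormat(const std::string& path,
                                              std::unique_ptr<rocksdb::DB>* out) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Reads dispatch on the SST footer magic number; writes go through the
  // "table_factory_to_write_" factory (block-based by default).
  options.table_factory.reset(rocksdb::NewAdaptiveTableFactory());
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  if (s.ok()) {
    out->reset(db);
  }
  return s;
}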
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +class AdaptiveTableFactory : public TableFactory { + public: + ~AdaptiveTableFactory() {} + + explicit AdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory); + + const char* Name() const override { return "AdaptiveTableFactory"; } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + std::string GetPrintableOptions() const override; + + private: + std::shared_ptr<TableFactory> table_factory_to_write_; + std::shared_ptr<TableFactory> block_based_table_factory_; + std::shared_ptr<TableFactory> plain_table_factory_; + std::shared_ptr<TableFactory> cuckoo_table_factory_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/block_based/binary_search_index_reader.cc b/src/rocksdb/table/block_based/binary_search_index_reader.cc new file mode 100644 index 000000000..21787cc1a --- /dev/null +++ b/src/rocksdb/table/block_based/binary_search_index_reader.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/binary_search_index_reader.h" + +namespace ROCKSDB_NAMESPACE { +Status BinarySearchIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase<IndexValue>* BinarySearchIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, + get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/binary_search_index_reader.h b/src/rocksdb/table/block_based/binary_search_index_reader.h new file mode 100644 index 000000000..d4a611ecc --- /dev/null +++ b/src/rocksdb/table/block_based/binary_search_index_reader.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader); + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block.cc b/src/rocksdb/table/block_based/block.cc new file mode 100644 index 000000000..7eb0b010f --- /dev/null +++ b/src/rocksdb/table/block_based/block.cc @@ -0,0 +1,1131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block_based/block.h" + +#include <algorithm> +#include <string> +#include <unordered_map> +#include <vector> + +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" +#include "table/format.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
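// Illustrative sketch (not RocksDB code) of the entry-header fast path that
// DecodeEntry implements here: when shared, non_shared and value_length are
// all < 128, each fits in a single varint byte, so the three header bytes can
// be read directly. The check "(a | b | c) < 128" is equivalent to checking
// each value individually, because bit 7 of the OR is set iff it is set in at
// least one operand.
#include <cassert>
#include <cstdint>

struct EntryHeader {
  uint32_t shared;
  uint32_t non_shared;
  uint32_t value_length;
};

// Returns a pointer past the header, or nullptr if the fast path does not
// apply (the real decoder would fall back to GetVarint32Ptr here).
inline const char* DecodeHeaderFastPath(const char* p, const char* limit,
                                        EntryHeader* out) {
  if (limit - p < 3) return nullptr;
  out->shared = static_cast<unsigned char>(p[0]);
  out->non_shared = static_cast<unsigned char>(p[1]);
  out->value_length = static_cast<unsigned char>(p[2]);
  if ((out->shared | out->non_shared | out->value_length) < 128) {
    return p + 3;  // all three values were single-byte varints
  }
  return nullptr;  // at least one value needs a multi-byte varint
}

int main() {
  const char buf[] = {5, 3, 100};  // shared=5, non_shared=3, value_length=100
  EntryHeader h;
  const char* rest = DecodeHeaderFastPath(buf, buf + sizeof(buf), &h);
  assert(rest == buf + 3 && h.shared == 5 && h.non_shared == 3 &&
         h.value_length == 100);
  return 0;
}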
+ assert(limit - p >= 3); + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +// Helper routine: similar to DecodeEntry but does not have assertions. +// Instead, returns nullptr so that caller can detect and report failure. +struct CheckAndDecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) { + return nullptr; + } + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
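// Minimal sketch of the varint32 encoding that GetVarint32Ptr() parses in the
// decoders above (LevelDB-style: 7 payload bits per byte, high bit set on all
// bytes except the last). Names here are ad hoc, not the util/coding.h API.
#include <cassert>
#include <cstdint>
#include <string>

void AppendVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Returns a pointer past the decoded value, or nullptr on truncation.
const char* ParseVarint32(const char* p, const char* limit, uint32_t* value) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    if (byte & 0x80) {
      result |= (byte & 0x7f) << shift;  // continuation bit set: keep reading
    } else {
      result |= byte << shift;
      *value = result;
      return p;
    }
  }
  return nullptr;
}

int main() {
  std::string buf;
  AppendVarint32(&buf, 5);    // encodes to a single byte
  AppendVarint32(&buf, 300);  // encodes to two bytes
  uint32_t a = 0, b = 0;
  const char* p = ParseVarint32(buf.data(), buf.data() + buf.size(), &a);
  p = ParseVarint32(p, buf.data() + buf.size(), &b);
  assert(p != nullptr && a == 5 && b == 300);
  return 0;
}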
+ if (limit - p < 3) return nullptr; + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; + } +}; + +struct DecodeEntryV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + assert(value_length); + + *value_length = 0; + return DecodeKeyV4()(p, limit, shared, non_shared); + } +}; +void DataBlockIter::NextImpl() { + bool is_shared = false; + ParseNextDataKey(&is_shared); +} + +void MetaBlockIter::NextImpl() { + bool is_shared = false; + ParseNextKey<CheckAndDecodeEntry>(&is_shared); +} + +void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } + +void IndexBlockIter::PrevImpl() { + assert(Valid()); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + // Loop until end of current entry hits the start of original entry + while (ParseNextIndexKey() && NextEntryOffset() < original) { + } +} + +void MetaBlockIter::PrevImpl() { + assert(Valid()); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + bool is_shared = false; + // Loop until end of current entry hits the start of original entry + while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) && + NextEntryOffset() < original) { + } +} + +// Similar to IndexBlockIter::PrevImpl but also caches the prev entries +void DataBlockIter::PrevImpl() { + assert(Valid()); + + assert(prev_entries_idx_ == -1 || + static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* key_ptr = nullptr; + bool raw_key_cached; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + raw_key_cached = false; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + raw_key_cached = true; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, + // not necessity. It is convenient since this class treats keys as pinned + // when `raw_key_` points to an outside buffer. So we cannot allow + // `raw_key_` point into Prev cache as it is a transient outside buffer + // (i.e., keys in it are not actually pinned). 
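// Simplified model (not the real iterator) of the Prev() strategy used by the
// PrevImpl() methods above: because entries are delta-encoded forward-only,
// stepping back means (1) walking the restart index back to the last restart
// point that starts before the current entry, then (2) re-parsing forward
// from that restart point until reaching the entry just before the one we
// started on.
#include <cassert>
#include <cstddef>
#include <vector>

struct ToyBlock {
  std::vector<int> entries;      // stand-in for decoded entries
  std::vector<size_t> restarts;  // indices into `entries`, ascending, [0] == 0
};

// Returns the index of the entry preceding `current`, or entries.size() if
// there is none (mirrors "current_ = restarts_" marking the iterator invalid).
size_t PrevEntry(const ToyBlock& b, size_t current) {
  if (current == 0) return b.entries.size();
  // (1) scan backwards to a restart point strictly before `current`
  size_t restart_index = b.restarts.size() - 1;
  while (b.restarts[restart_index] >= current) {
    if (restart_index == 0) return b.entries.size();
    --restart_index;
  }
  // (2) parse forward until the next entry would be `current`
  size_t pos = b.restarts[restart_index];
  while (pos + 1 < current) {
    ++pos;
  }
  return pos;
}

int main() {
  ToyBlock b{{10, 11, 12, 13, 14, 15}, {0, 3}};  // restart every 3 entries
  assert(PrevEntry(b, 4) == 3);                  // stays in the second interval
  assert(PrevEntry(b, 3) == 2);                  // crosses back into the first
  assert(PrevEntry(b, 0) == b.entries.size());   // no predecessor
  return 0;
}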
+ raw_key_.SetKey(current_key, raw_key_cached /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + + do { + bool is_shared = false; + if (!ParseNextDataKey(&is_shared)) { + break; + } + Slice current_key = raw_key_.GetKey(); + + if (raw_key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1; +} + +void DataBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +void MetaBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// kTypeBlobIndex, or kTypeWideColumnEntity, this function behaves identically +// to Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// kTypeBlobIndex, or kTypeWideColumnEntity: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key with a larger user_key or a matching +// user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
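// Background sketch for the "seek_user_key @ type | seqno" comment above. The
// assumption here is that an internal key is the user key followed by an
// 8-byte little-endian trailer packing (sequence_number << 8) | value_type,
// which is what ExtractUserKey()/ExtractValueType() peel off below. Helper
// names and the kTypeValueSketch constant are ad hoc, for illustration only.
#include <cassert>
#include <cstdint>
#include <string>

std::string MakeInternalKey(const std::string& user_key, uint64_t seqno,
                            uint8_t value_type) {
  uint64_t packed = (seqno << 8) | value_type;
  std::string ikey = user_key;
  for (int i = 0; i < 8; ++i) {  // fixed64, little-endian
    ikey.push_back(static_cast<char>((packed >> (8 * i)) & 0xff));
  }
  return ikey;
}

std::string UserKeyOf(const std::string& internal_key) {
  return internal_key.substr(0, internal_key.size() - 8);
}

uint8_t ValueTypeOf(const std::string& internal_key) {
  // The lowest byte of the little-endian trailer holds the value type.
  return static_cast<uint8_t>(internal_key[internal_key.size() - 8]);
}

int main() {
  const uint8_t kTypeValueSketch = 0x1;  // illustrative constant
  std::string ikey = MakeInternalKey("axy", /*seqno=*/60, kTypeValueSketch);
  assert(ikey.size() == 3 + 8);
  assert(UserKeyOf(ikey) == "axy");
  assert(ValueTypeOf(ikey) == kTypeValueSketch);
  return 0;
}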
+bool DataBlockIter::SeekForGetImpl(const Slice& target) { + Slice target_user_key = ExtractUserKey(target); + uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); + uint8_t entry = + data_block_hash_index_->Lookup(data_, map_offset, target_user_key); + + if (entry == kCollision) { + // HashSeek not effective, falling back + SeekImpl(target); + return true; + } + + if (entry == kNoEntry) { + // Even if we cannot find the user_key in this block, the result may + // exist in the next block. Consider this example: + // + // Block N: [aab@100, ... , app@120] + // boundary key: axy@50 (we make minimal assumption about a boundary key) + // Block N+1: [axy@10, ... ] + // + // If seek_key = axy@60, the search will starts from Block N. + // Even if the user_key is not found in the hash map, the caller still + // have to continue searching the next block. + // + // In this case, we pretend the key is the the last restart interval. + // The while-loop below will search the last restart interval for the + // key. It will stop at the first key that is larger than the seek_key, + // or to the end of the block if no one is larger. + entry = static_cast<uint8_t>(num_restarts_ - 1); + } + + uint32_t restart_index = entry; + + // check if the key is in the restart_interval + assert(restart_index < num_restarts_); + SeekToRestartPoint(restart_index); + current_ = GetRestartPoint(restart_index); + + uint32_t limit = restarts_; + if (restart_index + 1 < num_restarts_) { + limit = GetRestartPoint(restart_index + 1); + } + while (current_ < limit) { + bool shared; + // Here we only linear seek the target key inside the restart interval. + // If a key does not exist inside a restart interval, we avoid + // further searching the block content across restart interval boundary. + // + // TODO(fwu): check the left and right boundary of the restart interval + // to avoid linear seek a target key that is out of range. + if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { + // we stop at the first potential matching user key. + break; + } + } + + if (current_ == restarts_) { + // Search reaches to the end of the block. There are three possibilites: + // 1) there is only one user_key match in the block (otherwise collsion). + // the matching user_key resides in the last restart interval, and it + // is the last key of the restart interval and of the block as well. + // ParseNextKey() skiped it as its [ type | seqno ] is smaller. + // + // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, + // AND all existing user_keys in the restart interval are smaller than + // seek_user_key. + // + // 3) The seek_key is a false positive and happens to be hashed to the + // last restart interval, AND all existing user_keys in the restart + // interval are smaller than seek_user_key. + // + // The result may exist in the next block each case, so we return true. + return true; + } + + if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), + target_user_key) != 0) { + // the key is not in this block and cannot be at the next block either. 
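// Conceptual sketch (toy types, not DataBlockHashIndex) of the hash lookup
// that SeekForGetImpl relies on above: each user key hashes into a small
// bucket array whose entries hold the restart-interval index where the key
// may live, or one of two sentinels -- "no entry" and "collision" (fall back
// to the regular binary seek). Sentinel values here are arbitrary.
#include <cassert>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

constexpr uint8_t kToyNoEntry = 253;
constexpr uint8_t kToyCollision = 254;

class ToyHashIndex {
 public:
  explicit ToyHashIndex(size_t num_buckets)
      : buckets_(num_buckets, kToyNoEntry) {}

  void Add(const std::string& user_key, uint8_t restart_index) {
    uint8_t& slot = buckets_[Bucket(user_key)];
    if (slot == kToyNoEntry) {
      slot = restart_index;
    } else if (slot != restart_index) {
      slot = kToyCollision;  // keys with different targets share a bucket
    }
  }

  uint8_t Lookup(const std::string& user_key) const {
    return buckets_[Bucket(user_key)];
  }

 private:
  size_t Bucket(const std::string& key) const {
    return std::hash<std::string>{}(key) % buckets_.size();
  }
  std::vector<uint8_t> buckets_;
};

int main() {
  ToyHashIndex index(/*num_buckets=*/16);
  index.Add("app", /*restart_index=*/0);
  index.Add("axy", /*restart_index=*/1);
  uint8_t r = index.Lookup("axy");
  // Either the stored restart interval or the collision sentinel telling the
  // caller to fall back to SeekImpl() -- the cases handled in SeekForGetImpl.
  assert(r == 1 || r == kToyCollision);
  return 0;
}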
+ return false; + } + + // Here we are conservative and only support a limited set of cases + ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); + if (value_type != ValueType::kTypeValue && + value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeSingleDeletion && + value_type != ValueType::kTypeBlobIndex && + value_type != ValueType::kTypeWideColumnEntity) { + SeekImpl(target); + return true; + } + + // Result found, and the iter is correctly set. + return true; +} + +void IndexBlockIter::SeekImpl(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } + status_ = Status::OK(); + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = false; + if (prefix_index_) { + bool prefix_may_exist = true; + ok = PrefixSeek(target, &index, &prefix_may_exist); + if (!prefix_may_exist) { + // This is to let the caller to distinguish between non-existing prefix, + // and when key is larger than the last key, which both set Valid() to + // false. + current_ = restarts_; + status_ = Status::NotFound(); + } + // restart interval must be one when hash search is enabled so the binary + // search simply lands at the right place. + skip_linear_scan = true; + } else if (value_delta_encoded_) { + ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan); + } else { + ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); + } + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} + +void DataBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + SeekToLastImpl(); + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } + } +} + +void MetaBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + SeekToLastImpl(); + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } + } +} + +void DataBlockIter::SeekToFirstImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + bool is_shared = false; + ParseNextDataKey(&is_shared); +} + +void MetaBlockIter::SeekToFirstImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + bool is_shared = false; + ParseNextKey<CheckAndDecodeEntry>(&is_shared); +} + +void IndexBlockIter::SeekToFirstImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(0); + ParseNextIndexKey(); +} + +void DataBlockIter::SeekToLastImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + bool is_shared = false; + while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { + // 
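// Simplified model of the SeekForPrev() strategy shown above: first position
// at the first key >= target (the Seek path), then, if that overshoots, step
// backwards until the key is <= target; if nothing is >= target, start from
// the last entry. Toy container with std::string keys.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Returns the index of the last key <= target, or keys.size() if none exists.
size_t SeekForPrevIndex(const std::vector<std::string>& keys,
                        const std::string& target) {
  // "Seek": first element >= target.
  auto it = std::lower_bound(keys.begin(), keys.end(), target);
  if (it == keys.end()) {
    // Nothing >= target: SeekToLast(), which is the answer unless empty.
    return keys.empty() ? keys.size() : keys.size() - 1;
  }
  // "Prev() while CompareCurrentKey(seek_key) > 0": step back past keys > target.
  while (it != keys.begin() && *it > target) {
    --it;
  }
  return (*it > target) ? keys.size() : static_cast<size_t>(it - keys.begin());
}

int main() {
  std::vector<std::string> keys = {"b", "d", "f"};
  assert(SeekForPrevIndex(keys, "d") == 1);             // exact match
  assert(SeekForPrevIndex(keys, "e") == 1);             // falls back to "d"
  assert(SeekForPrevIndex(keys, "a") == keys.size());   // nothing <= "a"
  assert(SeekForPrevIndex(keys, "z") == 2);             // everything <= "z"
  return 0;
}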
Keep skipping + } +} + +void MetaBlockIter::SeekToLastImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + bool is_shared = false; + while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) && + NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void IndexBlockIter::SeekToLastImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +template <class TValue> +void BlockIter<TValue>::CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + raw_key_.Clear(); + value_.clear(); +} + +template <class TValue> +template <typename DecodeEntryFunc> +bool BlockIter<TValue>::ParseNextKey(bool* is_shared) { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || raw_key_.Size() < shared) { + CorruptionError(); + return false; + } else { + if (shared == 0) { + *is_shared = false; + // If this key doesn't share any bytes with prev key then we don't need + // to decode it and can use its address in the block directly. + raw_key_.SetKey(Slice(p, non_shared), false /* copy */); + } else { + // This key share `shared` bytes with prev key, we need to decode it + *is_shared = true; + raw_key_.TrimAppend(shared, p, non_shared); + } + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + return true; + } +} + +bool DataBlockIter::ParseNextDataKey(bool* is_shared) { + if (ParseNextKey<DecodeEntry>(is_shared)) { +#ifndef NDEBUG + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, + // kTypeDeletionWithTimestamp, or kTypeRangeDeletion. + uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); + SequenceNumber seqno; + ValueType value_type; + UnPackSequenceAndType(packed, &seqno, &value_type); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeDeletionWithTimestamp || + value_type == ValueType::kTypeRangeDeletion); + assert(seqno == 0); + } +#endif // NDEBUG + return true; + } else { + return false; + } +} + +bool IndexBlockIter::ParseNextIndexKey() { + bool is_shared = false; + bool ok = (value_delta_encoded_) ? 
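// Small sketch of the key reconstruction performed by ParseNextKey() above:
// each entry stores how many leading bytes it shares with the previous key
// plus only the non-shared suffix, so the reader keeps one "previous key"
// buffer, truncates it to `shared` bytes and appends the suffix (this is what
// IterKey::TrimAppend does in the real code). Toy types only.
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct ToyEntry {
  uint32_t shared;         // bytes reused from the previous key
  std::string non_shared;  // suffix stored in the block
};

std::vector<std::string> DecodeKeys(const std::vector<ToyEntry>& entries) {
  std::vector<std::string> keys;
  std::string prev;
  for (const ToyEntry& e : entries) {
    // A restart point would have e.shared == 0, i.e. the key is stored whole.
    prev.resize(e.shared);      // TrimAppend: keep the shared prefix...
    prev.append(e.non_shared);  // ...then append the new suffix.
    keys.push_back(prev);
  }
  return keys;
}

int main() {
  // Encodes the key sequence: "block", "blocked", "blunt"
  std::vector<ToyEntry> entries = {{0, "block"}, {5, "ed"}, {2, "unt"}};
  std::vector<std::string> keys = DecodeKeys(entries);
  assert(keys[0] == "block");
  assert(keys[1] == "blocked");
  assert(keys[2] == "blunt");
  return 0;
}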
ParseNextKey<DecodeEntryV4>(&is_shared) + : ParseNextKey<DecodeEntry>(&is_shared); + if (ok) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + DecodeCurrentValue(is_shared); + } + } + return ok; +} + +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block handle, is (offset, size) whenever the +// is_shared is false, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(bool is_shared) { + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); + } +} + +template <class TValue> +void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target, + uint32_t index, + bool skip_linear_scan) { + // SeekToRestartPoint() only does the lookup in the restart block. We need + // to follow it up with NextImpl() to position the iterator at the restart + // key. + SeekToRestartPoint(index); + NextImpl(); + + if (!skip_linear_scan) { + // Linear search (within restart block) for first key >= target + uint32_t max_offset; + if (index + 1 < num_restarts_) { + // We are in a non-last restart interval. Since `BinarySeek()` guarantees + // the next restart key is strictly greater than `target`, we can + // terminate upon reaching it without any additional key comparison. + max_offset = GetRestartPoint(index + 1); + } else { + // We are in the last restart interval. The while-loop will terminate by + // `Valid()` returning false upon advancing past the block's last key. + max_offset = std::numeric_limits<uint32_t>::max(); + } + while (true) { + NextImpl(); + if (!Valid()) { + break; + } + if (current_ == max_offset) { + assert(CompareCurrentKey(target) > 0); + break; + } else if (CompareCurrentKey(target) >= 0) { + break; + } + } + } +} + +// Binary searches in restart array to find the starting restart point for the +// linear scan, and stores it in `*index`. Assumes restart array does not +// contain duplicate keys. 
It is guaranteed that the restart key at `*index + 1` +// is strictly greater than `target` or does not exist (this can be used to +// elide a comparison when linear scan reaches all the way to the next restart +// key). Furthermore, `*skip_linear_scan` is set to indicate whether the +// `*index`th restart key is the final result so that key does not need to be +// compared again later. +template <class TValue> +template <typename DecodeKeyFunc> +bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index, + bool* skip_linear_scan) { + if (restarts_ == 0) { + // SST files dedicated to range tombstones are written with index blocks + // that have no keys while also having `num_restarts_ == 1`. This would + // cause a problem for `BinarySeek()` as it'd try to access the first key + // which does not exist. We identify such blocks by the offset at which + // their restarts are stored, and return false to prevent any attempted + // key accesses. + return false; + } + + *skip_linear_scan = false; + // Loop invariants: + // - Restart key at index `left` is less than or equal to the target key. The + // sentinel index `-1` is considered to have a key that is less than all + // keys. + // - Any restart keys after index `right` are strictly greater than the target + // key. + int64_t left = -1, right = num_restarts_ - 1; + while (left != right) { + // The `mid` is computed by rounding up so it lands in (`left`, `right`]. + int64_t mid = left + (right - left + 1) / 2; + uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid)); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + raw_key_.SetKey(mid_key, false /* copy */); + int cmp = CompareCurrentKey(target); + if (cmp < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else if (cmp > 0) { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } else { + *skip_linear_scan = true; + left = right = mid; + } + } + + if (left == -1) { + // All keys in the block were strictly greater than `target`. So the very + // first key in the block is the final seek result. + *skip_linear_scan = true; + *index = 0; + } else { + *index = static_cast<uint32_t>(left); + } + return true; +} + +// Compare target key and the block key of the block of `block_index`. +// Return -1 if error. +int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { + uint32_t region_offset = GetRestartPoint(block_index); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? 
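// Self-contained model of the BinarySeek() + FindKeyAfterBinarySeek() split
// described above: binary-search only the restart keys to find the last
// restart point whose key is <= target, then scan linearly from there for the
// first key >= target. Toy data layout: all keys in a flat vector, restart
// points every `interval` entries.
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

size_t SeekGE(const std::vector<std::string>& keys, size_t interval,
              const std::string& target) {
  if (keys.empty()) return 0;
  // Binary search over restart points: find the last restart whose key is
  // <= target (left == -1 means every restart key is > target).
  long long left = -1, right = static_cast<long long>((keys.size() - 1) / interval);
  while (left != right) {
    long long mid = left + (right - left + 1) / 2;  // round up
    if (keys[static_cast<size_t>(mid) * interval] <= target) {
      left = mid;       // restart key <= target: keep searching to the right
    } else {
      right = mid - 1;  // restart key > target: discard mid and beyond
    }
  }
  size_t start = (left < 0) ? 0 : static_cast<size_t>(left) * interval;
  // Linear scan from the chosen restart point for the first key >= target.
  size_t i = start;
  while (i < keys.size() && keys[i] < target) {
    ++i;
  }
  return i;  // == keys.size() means "past the last key"
}

int main() {
  std::vector<std::string> keys = {"a", "c", "e", "g", "i", "k", "m"};
  const size_t kInterval = 3;  // restart keys: "a", "g", "m"
  assert(SeekGE(keys, kInterval, "e") == 2);
  assert(SeekGE(keys, kInterval, "f") == 3);
  assert(SeekGE(keys, kInterval, "a") == 0);
  assert(SeekGE(keys, kInterval, "z") == keys.size());
  return 0;
}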
DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return 1; // Return target is smaller + } + Slice block_key(key_ptr, non_shared); + raw_key_.SetKey(block_key, false /* copy */); + return CompareCurrentKey(target); +} + +// Binary search in block_ids to find the first block +// with a key >= target +bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, + uint32_t* block_ids, uint32_t left, + uint32_t right, uint32_t* index, + bool* prefix_may_exist) { + assert(left <= right); + assert(index); + assert(prefix_may_exist); + *prefix_may_exist = true; + uint32_t left_bound = left; + + while (left <= right) { + uint32_t mid = (right + left) / 2; + + int cmp = CompareBlockKey(block_ids[mid], target); + if (!status_.ok()) { + return false; + } + if (cmp < 0) { + // Key at "target" is larger than "mid". Therefore all + // blocks before or at "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "target" is <= "mid". Therefore all blocks + // after "mid" are uninteresting. + // If there is only one block left, we found it. + if (left == right) break; + right = mid; + } + } + + if (left == right) { + // In one of the two following cases: + // (1) left is the first one of block_ids + // (2) there is a gap of blocks between block of `left` and `left-1`. + // we can further distinguish the case of key in the block or key not + // existing, by comparing the target key and the key of the previous + // block to the left of the block found. + if (block_ids[left] > 0 && + (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && + CompareBlockKey(block_ids[left] - 1, target) > 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } + + *index = block_ids[left]; + return true; + } else { + assert(left > right); + + // If the next block key is larger than seek key, it is possible that + // no key shares the prefix with `target`, or all keys with the same + // prefix as `target` are smaller than prefix. In the latter case, + // we are mandated to set the position the same as the total order. + // In the latter case, either: + // (1) `target` falls into the range of the next block. In this case, + // we can place the iterator to the next block, or + // (2) `target` is larger than all block keys. In this case we can + // keep the iterator invalidate without setting `prefix_may_exist` + // to false. + // We might sometimes end up with setting the total order position + // while there is no key sharing the prefix as `target`, but it + // still follows the contract. + uint32_t right_index = block_ids[right]; + assert(right_index + 1 <= num_restarts_); + if (right_index + 1 < num_restarts_) { + if (CompareBlockKey(right_index + 1, target) >= 0) { + *index = right_index + 1; + return true; + } else { + // We have to set the flag here because we are not positioning + // the iterator to the total order position. 
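// Conceptual sketch (toy structures, not BlockPrefixIndex) of the prefix-seek
// path above: the prefix index maps a key prefix to the ids of the index
// blocks that may contain keys with that prefix, and the seek then searches
// only among those candidate blocks for the first one whose separator key is
// >= the target.
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyPrefixIndex {
  // prefix -> ascending ids of candidate blocks
  std::unordered_map<std::string, std::vector<uint32_t>> prefix_to_blocks;
  // separator (boundary) key of each block, ascending by block id
  std::vector<std::string> block_separator;
};

// Returns the id of the first candidate block whose separator key is >=
// target, or UINT32_MAX when the prefix has no candidates or all candidates
// end before target (the caller would then invalidate the iterator).
uint32_t PrefixSeek(const ToyPrefixIndex& idx, const std::string& prefix,
                    const std::string& target) {
  auto it = idx.prefix_to_blocks.find(prefix);
  if (it == idx.prefix_to_blocks.end()) {
    return UINT32_MAX;  // analogous to prefix_may_exist = false
  }
  for (uint32_t block_id : it->second) {
    if (idx.block_separator[block_id] >= target) {
      return block_id;
    }
  }
  return UINT32_MAX;
}

int main() {
  ToyPrefixIndex idx;
  idx.block_separator = {"app", "axz", "bzz"};  // blocks 0, 1, 2
  idx.prefix_to_blocks["a"] = {0, 1};           // prefix "a" spans blocks 0..1
  assert(PrefixSeek(idx, "a", "ant") == 0);
  assert(PrefixSeek(idx, "a", "aqq") == 1);
  assert(PrefixSeek(idx, "c", "cat") == UINT32_MAX);  // unknown prefix
  return 0;
}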
+ *prefix_may_exist = false; + } + } + + // Mark iterator invalid + current_ = restarts_; + return false; + } +} + +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, + bool* prefix_may_exist) { + assert(index); + assert(prefix_may_exist); + assert(prefix_index_); + *prefix_may_exist = true; + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } + uint32_t* block_ids = nullptr; + uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); + + if (num_blocks == 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } else { + assert(block_ids); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, + prefix_may_exist); + } +} + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2 * sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. + // TEST_SYNC_POINT("Block::~Block"); +} + +Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()), + restart_offset_(0), + num_restarts_(0) { + TEST_SYNC_POINT("Block::Block:0"); + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + // Should only decode restart points for uncompressed blocks + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast<uint32_t>(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. 
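// Illustrative sketch of the footer packing that UnPackIndexTypeAndNumRestarts
// undoes in NumRestarts()/IndexType() above: a single uint32_t stores the
// restart count, with the high bit borrowed as a flag for the data-block index
// type (binary-search-only vs. binary + hash). Assumption: the high-bit layout
// shown here is a model of table/block_based/data_block_footer.cc, not a spec.
#include <cassert>
#include <cstdint>

enum class ToyIndexType : uint32_t { kBinarySearch = 0, kBinaryAndHash = 1 };

constexpr uint32_t kIndexTypeBit = 1u << 31;

uint32_t PackFooter(ToyIndexType type, uint32_t num_restarts) {
  assert(num_restarts < kIndexTypeBit);  // restart count must fit in 31 bits
  uint32_t footer = num_restarts;
  if (type == ToyIndexType::kBinaryAndHash) {
    footer |= kIndexTypeBit;
  }
  return footer;
}

void UnpackFooter(uint32_t footer, ToyIndexType* type, uint32_t* num_restarts) {
  *type = (footer & kIndexTypeBit) ? ToyIndexType::kBinaryAndHash
                                   : ToyIndexType::kBinarySearch;
  *num_restarts = footer & ~kIndexTypeBit;
}

int main() {
  uint32_t footer = PackFooter(ToyIndexType::kBinaryAndHash, 17);
  ToyIndexType type;
  uint32_t num_restarts;
  UnpackFooter(footer, &type, &num_restarts);
  assert(type == ToyIndexType::kBinaryAndHash && num_restarts == 17);
  return 0;
}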
+ size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast<uint16_t>(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker + } + } + if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + read_amp_bitmap_.reset(new BlockReadAmpBitmap( + restart_offset_, read_amp_bytes_per_bit, statistics)); + } +} + +MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { + MetaBlockIter* iter = new MetaBlockIter(); + if (size_ < 2 * sizeof(uint32_t)) { + iter->Invalidate(Status::Corruption("bad block contents")); + return iter; + } else if (num_restarts_ == 0) { + // Empty block. + iter->Invalidate(Status::OK()); + } else { + iter->Initialize(data_, restart_offset_, num_restarts_, + block_contents_pinned); + } + return iter; +} + +DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { + DataBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new DataBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + ret_iter->Initialize( + raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, + read_amp_bitmap_.get(), block_contents_pinned, + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + if (read_amp_bitmap_) { + if (read_amp_bitmap_->GetStatistics() != stats) { + // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ + read_amp_bitmap_->SetStatistics(stats); + } + } + } + + return ret_iter; +} + +IndexBlockIter* Block::NewIndexIterator( + const Comparator* raw_ucmp, SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, + bool have_first_key, bool key_includes_seq, bool value_is_full, + bool block_contents_pinned, BlockPrefixIndex* prefix_index) { + IndexBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new IndexBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + BlockPrefixIndex* prefix_index_ptr = + total_order_seek ? 
nullptr : prefix_index; + ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, + global_seqno, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); + } + + return ret_iter; +} + +size_t Block::ApproximateMemoryUsage() const { + size_t usage = usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + if (read_amp_bitmap_) { + usage += read_amp_bitmap_->ApproximateMemoryUsage(); + } + return usage; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h new file mode 100644 index 000000000..5d73f72f6 --- /dev/null +++ b/src/rocksdb/table/block_based/block.h @@ -0,0 +1,744 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> + +#include <string> +#include <vector> + +#include "db/pinned_iterators_manager.h" +#include "port/malloc.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +struct BlockContents; +class Comparator; +template <class TValue> +class BlockIter; +class DataBlockIter; +class IndexBlockIter; +class MetaBlockIter; +class BlockPrefixIndex; + +// BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data +// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of +// bytes in the Block we update the bitmap and increment +// READ_AMP_ESTIMATE_USEFUL_BYTES. 
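// Simplified model of the BlockReadAmpBitmap declared below: the block is
// divided into fixed-size chunks ("bytes per bit"), reading a byte range marks
// the chunks it covers, and the count of newly marked chunks times the chunk
// size approximates the bytes actually useful to the workload versus the whole
// block that was loaded. The real class uses atomics, power-of-two chunk sizes
// and Statistics tickers; this toy version is single-threaded.
#include <cassert>
#include <cstddef>
#include <vector>

class ToyReadAmpBitmap {
 public:
  ToyReadAmpBitmap(size_t block_size, size_t bytes_per_bit)
      : bytes_per_bit_(bytes_per_bit),
        bits_((block_size + bytes_per_bit - 1) / bytes_per_bit, false) {}

  // Marks [start, end) as read; returns the estimated newly-useful bytes.
  size_t Mark(size_t start, size_t end) {
    size_t newly_useful = 0;
    for (size_t bit = start / bytes_per_bit_;
         bit < bits_.size() && bit * bytes_per_bit_ < end; ++bit) {
      if (!bits_[bit]) {
        bits_[bit] = true;
        newly_useful += bytes_per_bit_;
      }
    }
    return newly_useful;
  }

 private:
  size_t bytes_per_bit_;
  std::vector<bool> bits_;
};

int main() {
  ToyReadAmpBitmap bitmap(/*block_size=*/4096, /*bytes_per_bit=*/32);
  assert(bitmap.Mark(0, 100) == 128);  // chunks 0..3 newly marked
  assert(bitmap.Mark(64, 96) == 0);    // already counted
  assert(bitmap.Mark(96, 160) == 32);  // only chunk 4 is new
  return 0;
}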
+class BlockReadAmpBitmap { + public: + explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, + Statistics* statistics) + : bitmap_(nullptr), + bytes_per_bit_pow_(0), + statistics_(statistics), + rnd_(Random::GetTLSInstance()->Uniform( + static_cast<int>(bytes_per_bit))) { + TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); + assert(block_size > 0 && bytes_per_bit > 0); + + // convert bytes_per_bit to be a power of 2 + while (bytes_per_bit >>= 1) { + bytes_per_bit_pow_++; + } + + // num_bits_needed = ceil(block_size / bytes_per_bit) + size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; + assert(num_bits_needed > 0); + + // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) + size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; + + // Create bitmap and set all the bits to 0 + bitmap_ = new std::atomic<uint32_t>[bitmap_size](); + + RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); + } + + ~BlockReadAmpBitmap() { delete[] bitmap_; } + + void Mark(uint32_t start_offset, uint32_t end_offset) { + assert(end_offset >= start_offset); + // Index of first bit in mask + uint32_t start_bit = + (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> + bytes_per_bit_pow_; + // Index of last bit in mask + 1 + uint32_t exclusive_end_bit = + (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; + if (start_bit >= exclusive_end_bit) { + return; + } + assert(exclusive_end_bit > 0); + + if (GetAndSet(start_bit) == 0) { + uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) + << bytes_per_bit_pow_; + RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, + new_useful_bytes); + } + } + + Statistics* GetStatistics() { + return statistics_.load(std::memory_order_relaxed); + } + + void SetStatistics(Statistics* stats) { statistics_.store(stats); } + + uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } + + size_t ApproximateMemoryUsage() const { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size((void*)this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return sizeof(*this); + } + + private: + // Get the current value of bit at `bit_idx` and set it to 1 + inline bool GetAndSet(uint32_t bit_idx) { + const uint32_t byte_idx = bit_idx / kBitsPerEntry; + const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); + + return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & + bit_mask; + } + + const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes + const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits + + // Bitmap used to record the bytes that we read, use atomic to protect + // against multiple threads updating the same bit + std::atomic<uint32_t>* bitmap_; + // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize + // muliplication and division + uint8_t bytes_per_bit_pow_; + // Pointer to DB Statistics object, Since this bitmap may outlive the DB + // this pointer maybe invalid, but the DB will update it to a valid pointer + // by using SetStatistics() before calling Mark() + std::atomic<Statistics*> statistics_; + uint32_t rnd_; +}; + +// class Block is the uncompressed and "parsed" form for blocks containing +// key-value pairs. (See BlockContents comments for more on terminology.) 
+// This includes the in-memory representation of data blocks, index blocks +// (including partitions), range deletion blocks, properties blocks, metaindex +// blocks, as well as the top level of the partitioned filter structure (which +// is actually an index of the filter partitions). It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. +// +// TODO: Rename to ParsedKvBlock? +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0, + Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + // The additional memory space taken by the block data. + size_t usable_size() const { return contents_.usable_size(); } + uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + + BlockBasedTableOptions::DataBlockIndexType IndexType() const; + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // + // If iter is null, return new Iterator + // If iter is not null, update this one and return it as Iterator* + // + // Updates read_amp_bitmap_ if it is not nullptr. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + DataBlockIter* NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // Returns an MetaBlockIter for iterating over blocks containing metadata + // (like Properties blocks). Unlike data blocks, the keys for these blocks + // do not contain sequence numbers, do not use a user-define comparator, and + // do not track read amplification/statistics. Additionally, MetaBlocks will + // not assert if the block is formatted improperly. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false); + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // + // key_includes_seq, default true, means that the keys are in internal key + // format. 
+ // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const; + + private: + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() + uint32_t restart_offset_; // Offset in data_ of restart array + uint32_t num_restarts_; + std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_; + DataBlockHashIndex data_block_hash_index_; +}; + +// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The +// format of this data buffer is an uncompressed, sorted sequence of key-value +// pairs (see `Block` API for more details). +// +// Notably, the keys may either be in internal key format or user key format. +// Subclasses are responsible for configuring the key format. +// +// `BlockIter` intends to provide final overrides for all of +// `InternalIteratorBase` functions that can move the iterator. It does +// this to guarantee `UpdateKey()` is called exactly once after each key +// movement potentially visible to users. In this step, the key is prepared +// (e.g., serialized if global seqno is in effect) so it can be returned +// immediately when the user asks for it via calling `key() const`. +// +// For its subclasses, it provides protected variants of the above-mentioned +// final-overridden methods. They are named with the "Impl" suffix, e.g., +// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These +// "Impl" functions are responsible for positioning `raw_key_` but not +// invoking `UpdateKey()`. +template <class TValue> +class BlockIter : public InternalIteratorBase<TValue> { + public: + // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do + // nothing. Calls cleanup functions. + virtual void Invalidate(const Status& s) { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); + + data_ = nullptr; + current_ = restarts_; + status_ = s; + + // Call cleanup callbacks. 
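+    // (Cleanable::Reset() below runs any cleanup functions registered on
+    // this iterator, e.g. releasing a pinned cache handle, then clears them.)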
+ Cleanable::Reset(); + } + + bool Valid() const override { return current_ < restarts_; } + + virtual void SeekToFirst() override final { + SeekToFirstImpl(); + UpdateKey(); + } + + virtual void SeekToLast() override final { + SeekToLastImpl(); + UpdateKey(); + } + + virtual void Seek(const Slice& target) override final { + SeekImpl(target); + UpdateKey(); + } + + virtual void SeekForPrev(const Slice& target) override final { + SeekForPrevImpl(target); + UpdateKey(); + } + + virtual void Next() override final { + NextImpl(); + UpdateKey(); + } + + virtual bool NextAndGetResult(IterateResult* result) override final { + // This does not need to call `UpdateKey()` as the parent class only has + // access to the `UpdateKey()`-invoking functions. + return InternalIteratorBase<TValue>::NextAndGetResult(result); + } + + virtual void Prev() override final { + PrevImpl(); + UpdateKey(); + } + + Status status() const override { return status_; } + Slice key() const override { + assert(Valid()); + return key_; + } + +#ifndef NDEBUG + ~BlockIter() override { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + status_.PermitUncheckedError(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } + + bool IsValuePinned() const override { return block_contents_pinned_; } + + size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } + + uint32_t ValueOffset() const { + return static_cast<uint32_t>(value_.data() - data_); + } + + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + + protected: + std::unique_ptr<InternalKeyComparator> icmp_; + const char* data_; // underlying block contents + uint32_t num_restarts_; // Number of uint32_t entries in restart array + + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + // Raw key from block. + IterKey raw_key_; + // Buffer for key data when global seqno assignment is enabled. + IterKey key_buf_; + Slice value_; + Status status_; + // Key to be exposed to users. + Slice key_; + bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. PinnableSlice, the pointer to the bytes will still be valid. 
+ bool block_contents_pinned_; + SequenceNumber global_seqno_; + + virtual void SeekToFirstImpl() = 0; + virtual void SeekToLastImpl() = 0; + virtual void SeekImpl(const Slice& target) = 0; + virtual void SeekForPrevImpl(const Slice& target) = 0; + virtual void NextImpl() = 0; + + virtual void PrevImpl() = 0; + + template <typename DecodeEntryFunc> + inline bool ParseNextKey(bool* is_shared); + + void InitializeBase(const Comparator* raw_ucmp, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, bool block_contents_pinned) { + assert(data_ == nullptr); // Ensure it is called only once + assert(num_restarts > 0); // Ensure the param is valid + + icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp); + data_ = data; + restarts_ = restarts; + num_restarts_ = num_restarts; + current_ = restarts_; + restart_index_ = num_restarts_; + global_seqno_ = global_seqno; + block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; + } + + // Must be called every time a key is found that needs to be returned to user, + // and may be called when no key is found (as a no-op). Updates `key_`, + // `key_buf_`, and `key_pinned_` with info about the found key. + void UpdateKey() { + key_buf_.Clear(); + if (!Valid()) { + return; + } + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + key_ = raw_key_.GetUserKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + key_ = raw_key_.GetInternalKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else { + key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, + ExtractValueType(raw_key_.GetInternalKey())); + key_ = key_buf_.GetInternalKey(); + key_pinned_ = false; + } + } + + // Returns the result of `Comparator::Compare()`, where the appropriate + // comparator is used for the block contents, the LHS argument is the current + // key with global seqno applied, and the RHS argument is `other`. + int CompareCurrentKey(const Slice& other) { + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + return icmp_->Compare(raw_key_.GetInternalKey(), other); + } + return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other, + kDisableGlobalSequenceNumber); + } + + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + + public: + // Return the offset in data_ just past the end of the current entry. 
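+  // (Computed from value_, which always ends exactly where the next entry
+  // begins, so no separate end-of-entry bookkeeping is kept.)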
+ inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast<uint32_t>((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + raw_key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + void CorruptionError(); + + protected: + template <typename DecodeKeyFunc> + inline bool BinarySeek(const Slice& target, uint32_t* index, + bool* is_index_key_result); + + void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, + bool is_index_key_result); +}; + +class DataBlockIter final : public BlockIter<Slice> { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) + : DataBlockIter() { + Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno, + read_amp_bitmap, block_contents_pinned, data_block_hash_index); + } + void Initialize(const Comparator* raw_ucmp, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { + InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, + block_contents_pinned); + raw_key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + SeekImpl(target); + UpdateKey(); + return true; + } + bool res = SeekForGetImpl(target); + UpdateKey(); + return res; + } + + void Invalidate(const Status& s) override { + BlockIter::Invalidate(s); + // Clear prev entries cache. 
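+    // (prev_entries_ holds entries cached during backward iteration; they
+    // reference the old block contents, so they must be dropped here.)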
+ prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + protected: + friend Block; + inline bool ParseNextDataKey(bool* is_shared); + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; + + private: + // read-amp bitmap + BlockReadAmpBitmap* read_amp_bitmap_; + // last `current_` value we report to read-amp bitmp + mutable uint32_t last_bitmap_offset_; + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector<CachedPrevEntry> prev_entries_; + int32_t prev_entries_idx_ = -1; + + DataBlockHashIndex* data_block_hash_index_; + + bool SeekForGetImpl(const Slice& target); +}; + +// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and +// are used to store Properties associated with table. +// Meta blocks always store user keys (no sequence number) and always +// use the BytewiseComparator. Additionally, MetaBlock accesses are +// not recorded in the Statistics or for Read-Amplification. +class MetaBlockIter final : public BlockIter<Slice> { + public: + MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } + void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, + bool block_contents_pinned) { + // Initializes the iterator with a BytewiseComparator and + // the raw key being a user key. + InitializeBase(BytewiseComparator(), data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(true); + } + + Slice value() const override { + assert(Valid()); + return value_; + } + + protected: + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; +}; + +class IndexBlockIter final : public BlockIter<IndexValue> { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. 
+  void Initialize(const Comparator* raw_ucmp, const char* data,
+                  uint32_t restarts, uint32_t num_restarts,
+                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
+                  bool have_first_key, bool key_includes_seq,
+                  bool value_is_full, bool block_contents_pinned) {
+    InitializeBase(raw_ucmp, data, restarts, num_restarts,
+                   kDisableGlobalSequenceNumber, block_contents_pinned);
+    raw_key_.SetIsUserKey(!key_includes_seq);
+    prefix_index_ = prefix_index;
+    value_delta_encoded_ = !value_is_full;
+    have_first_key_ = have_first_key;
+    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
+      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
+    } else {
+      global_seqno_state_.reset();
+    }
+  }
+
+  Slice user_key() const override {
+    assert(Valid());
+    return raw_key_.GetUserKey();
+  }
+
+  IndexValue value() const override {
+    assert(Valid());
+    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+      return decoded_value_;
+    } else {
+      IndexValue entry;
+      Slice v = value_;
+      Status decode_s __attribute__((__unused__)) =
+          entry.DecodeFrom(&v, have_first_key_, nullptr);
+      assert(decode_s.ok());
+      return entry;
+    }
+  }
+
+  bool IsValuePinned() const override {
+    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
+  }
+
+ protected:
+  // IndexBlockIter follows a different contract for prefix iteration than
+  // data iterators.
+  // If the prefix of the seek key `target` exists in the file, it must
+  // return the same result as a total order seek.
+  // If the prefix of `target` doesn't exist in the file, it can either
+  // return the result of a total order seek, or set both Valid() = false
+  // and status() = NotFound().
+  void SeekImpl(const Slice& target) override;
+
+  void SeekForPrevImpl(const Slice&) override {
+    assert(false);
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    status_ = Status::InvalidArgument(
+        "RocksDB internal error: should never call SeekForPrev() on index "
+        "blocks");
+    raw_key_.Clear();
+    value_.clear();
+  }
+
+  void PrevImpl() override;
+
+  void NextImpl() override;
+
+  void SeekToFirstImpl() override;
+
+  void SeekToLastImpl() override;
+
+ private:
+  bool value_delta_encoded_;
+  bool have_first_key_;  // value includes first_internal_key
+  BlockPrefixIndex* prefix_index_;
+  // Whether the value is delta encoded. In that case the value is assumed to
+  // be a BlockHandle. The first value in each restart interval is the full
+  // encoded BlockHandle; the rest encode only the size part of the
+  // BlockHandle. The offset of each delta encoded BlockHandle is computed by
+  // adding the sizes of the previous delta encoded values in the same restart
+  // interval to the offset of the first value in that restart interval.
+  IndexValue decoded_value_;
+
+  // When sequence number overwriting is enabled, this struct contains the
+  // seqno to overwrite with, and the current first_internal_key with the
+  // overwritten seqno. This is rarely used, so we put it behind a pointer and
+  // only allocate it when needed.
+  struct GlobalSeqnoState {
+    // First internal key according to current index entry, but with sequence
+    // number overwritten to global_seqno.
+    IterKey first_internal_key;
+    SequenceNumber global_seqno;
+
+    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
+  };
+
+  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
+
+  // Set *prefix_may_exist to false if no key can possibly share the same
+  // prefix as `target`. If not set, the result position should be the same
+  // as a total order seek.
+ bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); + // Set *prefix_may_exist to false if no key can possibly share the same + // prefix as `target`. If not set, the result position should be the same + // as total order seek. + bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index, + bool* prefix_may_exist); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline bool ParseNextIndexKey(); + + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(bool is_shared); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_builder.cc b/src/rocksdb/table/block_based/block_based_table_builder.cc new file mode 100644 index 000000000..fed69af07 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.cc @@ -0,0 +1,2096 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/block_based_table_builder.h" + +#include <assert.h> +#include <stdio.h> + +#include <atomic> +#include <list> +#include <map> +#include <memory> +#include <numeric> +#include <string> +#include <unordered_map> +#include <utility> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_helpers.h" +#include "cache/cache_key.h" +#include "cache/cache_reservation_manager.h" +#include "db/dbformat.h" +#include "index_builder.h" +#include "logging/logging.h" +#include "memory/memory_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/table.h" +#include "rocksdb/types.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/block_like_traits.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/work_queue.h" + +namespace ROCKSDB_NAMESPACE { + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + +constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize; + +// Create a filter block builder based on its type. 
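+// Returns nullptr, meaning no filter block will be built, if the configured
+// FilterPolicy yields no FilterBitsBuilder for this context.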
+FilterBlockBuilder* CreateFilterBlockBuilder(
+    const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+    const FilterBuildingContext& context,
+    const bool use_delta_encoding_for_index_values,
+    PartitionedIndexBuilder* const p_index_builder) {
+  const BlockBasedTableOptions& table_opt = context.table_options;
+  assert(table_opt.filter_policy);  // precondition
+
+  FilterBitsBuilder* filter_bits_builder =
+      BloomFilterPolicy::GetBuilderFromContext(context);
+  if (filter_bits_builder == nullptr) {
+    return nullptr;
+  } else {
+    if (table_opt.partition_filters) {
+      assert(p_index_builder != nullptr);
+      // Since it takes time from the filter builder's partition cut request
+      // until the index builder actually cuts the partition (at the end of a
+      // data block, which may contain many keys), we take the lower bound as
+      // the partition size.
+      assert(table_opt.block_size_deviation <= 100);
+      auto partition_size =
+          static_cast<uint32_t>(((table_opt.metadata_block_size *
+                                  (100 - table_opt.block_size_deviation)) +
+                                 99) /
+                                100);
+      partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+      return new PartitionedFilterBlockBuilder(
+          mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+          filter_bits_builder, table_opt.index_block_restart_interval,
+          use_delta_encoding_for_index_values, p_index_builder, partition_size);
+    } else {
+      return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                        table_opt.whole_key_filtering,
+                                        filter_bits_builder);
+    }
+  }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) {
+  // Check whether compression saved more than 1/8 (12.5%) of the original size
+  return compressed_size < uncomp_size - (uncomp_size / 8u);
+}
+
+}  // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow) {
+  assert(type);
+  assert(compressed_output);
+  assert(compressed_output->empty());
+
+  // If requested, we sample one in every N blocks with both a
+  // fast and a slow compression algorithm and report the stats.
+  // Users can use these stats to decide whether it is worthwhile to
+  // enable compression, and they also get a hint about which
+  // compression algorithm will be beneficial.
+  if (do_sample && info.SampleForCompression() &&
+      Random::GetTLSInstance()->OneIn(
+          static_cast<int>(info.SampleForCompression()))) {
+    // Sampling with a fast compression algorithm
+    if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressData(uncompressed_data, info_tmp,
+                   GetCompressFormatForVersion(format_version),
+                   sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
+      CompressionType c = ZSTD_Supported() ?
kZSTD : kZlibCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + + CompressData(uncompressed_data, info_tmp, + GetCompressFormatForVersion(format_version), + sampled_output_slow); + } + } + + if (info.type() == kNoCompression) { + *type = kNoCompression; + return uncompressed_data; + } + + // Actually compress the data; if the compression method is not supported, + // or the compression fails etc., just fall back to uncompressed + if (!CompressData(uncompressed_data, info, + GetCompressFormatForVersion(format_version), + compressed_output)) { + *type = kNoCompression; + return uncompressed_data; + } + + // Check the compression ratio; if it's not good enough, just fall back to + // uncompressed + if (!GoodCompressionRatio(compressed_output->size(), + uncompressed_data.size())) { + *type = kNoCompression; + return uncompressed_data; + } + + *type = info.type(); + return *compressed_output; +} + +// kBlockBasedTableMagicNumber was picked by running +// echo rocksdb.table.block_based | sha1sum +// and taking the leading 64 bits. +// Please note that kBlockBasedTableMagicNumber may also be accessed by other +// .cc files +// for that reason we declare it extern in the header but to get the space +// allocated +// it must be not extern in one place. +const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull; +// We also support reading and writing legacy block based table format (for +// backwards compatibility) +const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; + +// A collector that collects properties of interest to block-based table. +// For now this class looks heavy-weight since we only write one additional +// property. +// But in the foreseeable future, we will add more and more properties that are +// specific to block-based table. +class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector + : public IntTblPropCollector { + public: + explicit BlockBasedTablePropertiesCollector( + BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, + bool prefix_filtering) + : index_type_(index_type), + whole_key_filtering_(whole_key_filtering), + prefix_filtering_(prefix_filtering) {} + + Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. + return Status::OK(); + } + + virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + Status Finish(UserCollectedProperties* properties) override { + std::string val; + PutFixed32(&val, static_cast<uint32_t>(index_type_)); + properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering, + whole_key_filtering_ ? kPropTrue : kPropFalse}); + properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering, + prefix_filtering_ ? kPropTrue : kPropFalse}); + return Status::OK(); + } + + // The name of the properties collector can be used for debugging purpose. + const char* Name() const override { + return "BlockBasedTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + // Intentionally left blank. 
+ return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; + bool whole_key_filtering_; + bool prefix_filtering_; +}; + +struct BlockBasedTableBuilder::Rep { + const ImmutableOptions ioptions; + const MutableCFOptions moptions; + const BlockBasedTableOptions table_options; + const InternalKeyComparator& internal_comparator; + WritableFileWriter* file; + std::atomic<uint64_t> offset; + size_t alignment; + BlockBuilder data_block; + // Buffers uncompressed data blocks to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + std::vector<std::string> data_block_buffers; + BlockBuilder range_del_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr<IndexBuilder> index_builder; + PartitionedIndexBuilder* p_index_builder_ = nullptr; + + std::string last_key; + const Slice* first_key_in_next_block = nullptr; + CompressionType compression_type; + uint64_t sample_for_compression; + std::atomic<uint64_t> compressible_input_data_bytes; + std::atomic<uint64_t> uncompressible_input_data_bytes; + std::atomic<uint64_t> sampled_input_data_bytes; + std::atomic<uint64_t> sampled_output_slow_data_bytes; + std::atomic<uint64_t> sampled_output_fast_data_bytes; + CompressionOptions compression_opts; + std::unique_ptr<CompressionDict> compression_dict; + std::vector<std::unique_ptr<CompressionContext>> compression_ctxs; + std::vector<std::unique_ptr<UncompressionContext>> verify_ctxs; + std::unique_ptr<UncompressionDict> verify_dict; + + size_t data_begin_offset = 0; + + TableProperties props; + + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. + enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. 
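+  // A `buffer_limit` of 0 disables this size-based trigger; see the
+  // `exceeds_buffer_limit` check in `Add()`.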
+ uint64_t buffer_limit; + std::shared_ptr<CacheReservationManager> + compression_dict_buffer_cache_res_mgr; + const bool use_delta_encoding_for_index_values; + std::unique_ptr<FilterBlockBuilder> filter_builder; + OffsetableCacheKey base_cache_key; + const TableFileCreationReason reason; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr<FlushBlockPolicy> flush_block_policy; + + std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors; + + std::unique_ptr<ParallelCompressionRep> pc_rep; + + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } + void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } + + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. + if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); + } else { + return CopyStatus(); + } + } + + Status CopyStatus() { + std::lock_guard<std::mutex> lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); + } else { + return CopyIOStatus(); + } + } + + IOStatus CopyIOStatus() { + std::lock_guard<std::mutex> lock(io_status_mutex); + return io_status; + } + + // Never erase an existing status that is not OK. + void SetStatus(Status s) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard<std::mutex> lock(status_mutex); + status = s; + status_ok.store(false, std::memory_order_relaxed); + } + } + + // Never erase an existing I/O status that is not OK. + // Calling this will also SetStatus(ios) + void SetIOStatus(IOStatus ios) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard<std::mutex> lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); + } + SetStatus(ios); + } + + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), + table_options(table_opt), + internal_comparator(tbo.internal_comparator), + file(f), + offset(0), + alignment(table_options.block_align + ? std::min(static_cast<size_t>(table_options.block_size), + kDefaultPageSize) + : 0), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + tbo.internal_comparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? 
BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), + compression_dict(), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), + verify_dict(), + state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + reason(tbo.reason), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)), + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } + + const auto compress_dict_build_buffer_charged = + table_options.cache_usage_options.options_overrides + .at(CacheEntryRole::kCompressionDictionaryBuildingBuffer) + .charged; + if (table_options.block_cache && + (compress_dict_build_buffer_charged == + CacheEntryRoleOptions::Decision::kEnabled || + compress_dict_build_buffer_charged == + CacheEntryRoleOptions::Decision::kFallback)) { + compression_dict_buffer_cache_res_mgr = + std::make_shared<CacheReservationManagerImpl< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>>( + table_options.block_cache); + } else { + compression_dict_buffer_cache_res_mgr = nullptr; + } + + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + compression_ctxs[i].reset(new CompressionContext(compression_type)); + } + if (table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( + &internal_comparator, use_delta_encoding_for_index_values, + table_options); + index_builder.reset(p_index_builder_); + } else { + index_builder.reset(IndexBuilder::CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); + } + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter + filter_builder.reset(); + } else { + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + 
filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + + filter_builder.reset(CreateFilterBlockBuilder( + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); + } + + assert(tbo.int_tbl_prop_collector_factories); + for (auto& factory : *tbo.int_tbl_prop_collector_factories) { + assert(factory); + + table_properties_collectors.emplace_back( + factory->CreateIntTblPropCollector(tbo.column_family_id, + tbo.level_at_creation)); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector( + table_options.index_type, table_options.whole_key_filtering, + moptions.prefix_extractor != nullptr)); + const Comparator* ucmp = tbo.internal_comparator.user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + table_properties_collectors.emplace_back( + new TimestampTablePropertiesCollector(ucmp)); + } + if (table_options.verify_compression) { + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + verify_ctxs[i].reset(new UncompressionContext(compression_type)); + } + } + + // These are only needed for populating table properties + props.column_family_id = tbo.column_family_id; + props.column_family_name = tbo.column_family_name; + props.oldest_key_time = tbo.oldest_key_time; + props.file_creation_time = tbo.file_creation_time; + props.orig_file_number = tbo.cur_file_num; + props.db_id = tbo.db_id; + props.db_session_id = tbo.db_session_id; + props.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); + } + } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. + std::mutex status_mutex; + std::atomic<bool> status_ok; + Status status; + std::mutex io_status_mutex; + std::atomic<bool> io_status_ok; + IOStatus io_status; +}; + +struct BlockBasedTableBuilder::ParallelCompressionRep { + // Keys is a wrapper of vector of strings avoiding + // releasing string memories during vector clear() + // in order to save memory allocation overhead + class Keys { + public: + Keys() : keys_(kKeysInitSize), size_(0) {} + void PushBack(const Slice& key) { + if (size_ == keys_.size()) { + keys_.emplace_back(key.data(), key.size()); + } else { + keys_[size_].assign(key.data(), key.size()); + } + size_++; + } + void SwapAssign(std::vector<std::string>& keys) { + size_ = keys.size(); + std::swap(keys_, keys); + } + void Clear() { size_ = 0; } + size_t Size() { return size_; } + std::string& Back() { return keys_[size_ - 1]; } + std::string& operator[](size_t idx) { + assert(idx < size_); + return keys_[idx]; + } + + private: + const size_t kKeysInitSize = 32; + std::vector<std::string> keys_; + size_t size_; + }; + std::unique_ptr<Keys> curr_block_keys; + + class BlockRepSlot; + + // BlockRep instances are fetched from and recycled to + // block_rep_pool during parallel compression. 
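+  // Each BlockRep carries the buffers for one in-flight block, so the string
+  // storage is recycled through block_rep_pool rather than reallocated for
+  // every block.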
+ struct BlockRep { + Slice contents; + Slice compressed_contents; + std::unique_ptr<std::string> data; + std::unique_ptr<std::string> compressed_data; + CompressionType compression_type; + std::unique_ptr<std::string> first_key_in_next_block; + std::unique_ptr<Keys> keys; + std::unique_ptr<BlockRepSlot> slot; + Status status; + }; + // Use a vector of BlockRep as a buffer for a determined number + // of BlockRep structures. All data referenced by pointers in + // BlockRep will be freed when this vector is destructed. + using BlockRepBuffer = std::vector<BlockRep>; + BlockRepBuffer block_rep_buf; + // Use a thread-safe queue for concurrent access from block + // building thread and writer thread. + using BlockRepPool = WorkQueue<BlockRep*>; + BlockRepPool block_rep_pool; + + // Use BlockRepSlot to keep block order in write thread. + // slot_ will pass references to BlockRep + class BlockRepSlot { + public: + BlockRepSlot() : slot_(1) {} + template <typename T> + void Fill(T&& rep) { + slot_.push(std::forward<T>(rep)); + }; + void Take(BlockRep*& rep) { slot_.pop(rep); } + + private: + // slot_ will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + WorkQueue<BlockRep*> slot_; + }; + + // Compression queue will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + using CompressQueue = WorkQueue<BlockRep*>; + CompressQueue compress_queue; + std::vector<port::Thread> compress_thread_pool; + + // Write queue will pass references to BlockRep::slot in block_rep_buf, + // and those references are always valid before the corresponding + // BlockRep::slot is destructed, which is before the destruction of + // block_rep_buf. + using WriteQueue = WorkQueue<BlockRepSlot*>; + WriteQueue write_queue; + std::unique_ptr<port::Thread> write_thread; + + // Estimate output file size when parallel compression is enabled. This is + // necessary because compression & flush are no longer synchronized, + // and BlockBasedTableBuilder::FileSize() is no longer accurate. + // memory_order_relaxed suffices because accurate statistics is not required. 
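+  // Roughly, estimated_file_size = current file offset
+  //     + uncompressed bytes in flight * current compression ratio
+  //     + blocks in flight * kBlockTrailerSize.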
+ class FileSizeEstimator { + public: + explicit FileSizeEstimator() + : uncomp_bytes_compressed(0), + uncomp_bytes_curr_block(0), + uncomp_bytes_curr_block_set(false), + uncomp_bytes_inflight(0), + blocks_inflight(0), + curr_compression_ratio(0), + estimated_file_size(0) {} + + // Estimate file size when a block is about to be emitted to + // compression thread + void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) { + uint64_t new_uncomp_bytes_inflight = + uncomp_bytes_inflight.fetch_add(uncomp_block_size, + std::memory_order_relaxed) + + uncomp_block_size; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; + + estimated_file_size.store( + curr_file_size + + static_cast<uint64_t>( + static_cast<double>(new_uncomp_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Estimate file size when a block is already reaped from + // compression thread + void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { + assert(uncomp_bytes_curr_block_set); + + uint64_t new_uncomp_bytes_compressed = + uncomp_bytes_compressed + uncomp_bytes_curr_block; + assert(new_uncomp_bytes_compressed > 0); + + curr_compression_ratio.store( + (curr_compression_ratio.load(std::memory_order_relaxed) * + uncomp_bytes_compressed + + compressed_block_size) / + static_cast<double>(new_uncomp_bytes_compressed), + std::memory_order_relaxed); + uncomp_bytes_compressed = new_uncomp_bytes_compressed; + + uint64_t new_uncomp_bytes_inflight = + uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block, + std::memory_order_relaxed) - + uncomp_bytes_curr_block; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; + + estimated_file_size.store( + curr_file_size + + static_cast<uint64_t>( + static_cast<double>(new_uncomp_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + + uncomp_bytes_curr_block_set = false; + } + + void SetEstimatedFileSize(uint64_t size) { + estimated_file_size.store(size, std::memory_order_relaxed); + } + + uint64_t GetEstimatedFileSize() { + return estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockUncompSize(uint64_t size) { + uncomp_bytes_curr_block = size; + uncomp_bytes_curr_block_set = true; + } + + private: + // Input bytes compressed so far. + uint64_t uncomp_bytes_compressed; + // Size of current block being appended. + uint64_t uncomp_bytes_curr_block; + // Whether uncomp_bytes_curr_block has been set for next + // ReapBlock call. + bool uncomp_bytes_curr_block_set; + // Input bytes under compression and not appended yet. + std::atomic<uint64_t> uncomp_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic<uint64_t> blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock. + std::atomic<double> curr_compression_ratio; + // Estimated SST file size. + std::atomic<uint64_t> estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. 
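+  // EmitBlock() waits on first_block_cond until ReapBlock() marks the first
+  // block processed; after that, emission never blocks on this condition.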
+ std::atomic<bool> first_block_processed; + std::condition_variable first_block_cond; + std::mutex first_block_mutex; + + explicit ParallelCompressionRep(uint32_t parallel_threads) + : curr_block_keys(new Keys()), + block_rep_buf(parallel_threads), + block_rep_pool(parallel_threads), + compress_queue(parallel_threads), + write_queue(parallel_threads), + first_block_processed(false) { + for (uint32_t i = 0; i < parallel_threads; i++) { + block_rep_buf[i].contents = Slice(); + block_rep_buf[i].compressed_contents = Slice(); + block_rep_buf[i].data.reset(new std::string()); + block_rep_buf[i].compressed_data.reset(new std::string()); + block_rep_buf[i].compression_type = CompressionType(); + block_rep_buf[i].first_key_in_next_block.reset(new std::string()); + block_rep_buf[i].keys.reset(new Keys()); + block_rep_buf[i].slot.reset(new BlockRepSlot()); + block_rep_buf[i].status = Status::OK(); + block_rep_pool.push(&block_rep_buf[i]); + } + } + + ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector<std::string>* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } + + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock<std::mutex> lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard<std::mutex> lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + 
const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + ROCKS_LOG_WARN( + tbo.ioptions.logger, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(sanitized_table_options, tbo, file); + + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + const_cast<TableProperties*>(&rep_->props)); + + BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id, + tbo.cur_file_num, &rep_->base_cache_key); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + ValueType value_type = ExtractValueType(key); + if (IsValueType(value_type)) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } +#endif // !NDEBUG + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + r->first_key_in_next_block = &key; + Flush(); + if (r->state == Rep::State::kBuffered) { + bool exceeds_buffer_limit = + (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); + bool exceeds_global_block_cache_limit = false; + + // Increase cache charging for the last buffered data block + // only if the block is not going to be unbuffered immediately + // and there exists a cache reservation manager + if (!exceeds_buffer_limit && + r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = + r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( + r->data_begin_offset); + exceeds_global_block_cache_limit = s.IsMemoryLimit(); + } + + if (exceeds_buffer_limit || exceeds_global_block_cache_limit) { + EnterUnbuffered(); + } + } + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + if (ok() && r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->Clear(); + } else { + r->index_builder->AddIndexEntry(&r->last_key, &key, + r->pending_handle); + } + } + } + + // Note: PartitionedFilterBlockBuilder requires key being added to filter + // builder after being added to index builder. 
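+    // (With parallel compression the key is only recorded in curr_block_keys
+    // here; the per-key filter and index updates happen later from the
+    // block's key list.)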
+ if (r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->PushBack(key); + } else { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + } + } + + r->data_block.AddWithLastKey(key, value, r->last_key); + r->last_key.assign(key.data(), key.size()); + if (r->state == Rep::State::kBuffered) { + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. + } else { + if (!r->IsParallelCompressionEnabled()) { + r->index_builder->OnKeyAdded(key); + } + } + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + r->table_properties_collectors, + r->ioptions.logger); + + } else if (value_type == kTypeRangeDeletion) { + r->range_del_block.Add(key, value); + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), + r->table_properties_collectors, + r->ioptions.logger); + } else { + assert(false); + } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion || + value_type == kTypeDeletionWithTimestamp) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + if (r->data_block.empty()) return; + if (r->IsParallelCompressionEnabled() && + r->state == Rep::State::kUnbuffered) { + r->data_block.Finish(); + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData); + } +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle, + BlockType block_type) { + block->Finish(); + std::string uncompressed_block_data; + uncompressed_block_data.reserve(rep_->table_options.block_size); + block->SwapAndReset(uncompressed_block_data); + if (rep_->state == Rep::State::kBuffered) { + assert(block_type == BlockType::kData); + rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + WriteBlock(uncompressed_block_data, handle, block_type); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data, + BlockHandle* handle, + BlockType block_type) { + Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); + Slice block_contents; + CompressionType type; + Status compress_status; + bool is_data_block = block_type == BlockType::kData; + CompressAndVerifyBlock(uncompressed_block_data, is_data_block, + *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), + &(r->compressed_output), &(block_contents), &type, + &compress_status); + r->SetStatus(compress_status); + if (!ok()) { + return; + } + + 
WriteMaybeCompressedBlock(block_contents, type, handle, block_type, + &uncompressed_block_data); + r->compressed_output.clear(); + if (is_data_block) { + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::BGWorkCompression( + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); + CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ + compression_ctx, verify_ctx, + block_rep->compressed_data.get(), + &block_rep->compressed_contents, + &(block_rep->compression_type), &block_rep->status); + block_rep->slot->Fill(block_rep); + } +} + +void BlockBasedTableBuilder::CompressAndVerifyBlock( + const Slice& uncompressed_block_data, bool is_data_block, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, + std::string* compressed_output, Slice* block_contents, + CompressionType* type, Status* out_status) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } + + *type = r->compression_type; + uint64_t sample_for_compression = r->sample_for_compression; + bool abort_compression = false; + + StopWatchNano timer( + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); + + if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) { + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(), + std::memory_order_relaxed); + } + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); + } + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, compression_ctx, + *compression_dict, *type, + sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + *block_contents = CompressBlock( + uncompressed_block_data, compression_info, type, + r->table_options.format_version, is_data_block /* do_sample */, + compressed_output, &sampled_output_fast, &sampled_output_slow); + + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. + assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, uncompressed_block_data.size(), + sampled_output_fast.size(), sampled_output_slow.size()); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. 
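+    // A decompression failure or a mismatch aborts compression for this
+    // block and it is stored uncompressed instead.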
+ if (*type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); + BlockContents contents; + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, + r->compression_type); + Status stat = UncompressBlockData( + uncompression_info, block_contents->data(), block_contents->size(), + &contents, r->table_options.format_version, r->ioptions); + + if (stat.ok()) { + bool compressed_ok = + contents.data.compare(uncompressed_block_data) == 0; + if (!compressed_ok) { + // The result of the compression was invalid. abort. + abort_compression = true; + const char* const msg = + "Decompressed block did not match pre-compression block"; + ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg); + *out_status = Status::Corruption(msg); + } + } else { + // Decompression reported an error. abort. + *out_status = Status::Corruption(std::string("Could not decompress: ") + + stat.getState()); + abort_compression = true; + } + } + } else { + // Block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add( + uncompressed_block_data.size(), std::memory_order_relaxed); + } + abort_compression = true; + } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Abort compression if the block is too big, or did not pass + // verification. + if (abort_compression) { + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); + *type = kNoCompression; + *block_contents = uncompressed_block_data; + } else if (*type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, + uncompressed_block_data.size()); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); + } else if (*type != r->compression_type) { + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); + } +} + +void BlockBasedTableBuilder::WriteMaybeCompressedBlock( + const Slice& block_contents, CompressionType type, BlockHandle* handle, + BlockType block_type, const Slice* uncompressed_block_data) { + Rep* r = rep_; + bool is_data_block = block_type == BlockType::kData; + // Old, misleading name of this function: WriteRawBlock + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->get_offset()); + handle->set_size(block_contents.size()); + assert(status().ok()); + assert(io_status().ok()); + + { + IOStatus io_s = r->file->Append(block_contents); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } + } + + std::array<char, kBlockTrailerSize> trailer; + trailer[0] = type; + uint32_t checksum = ComputeBuiltinChecksumWithLastByte( + r->table_options.checksum, block_contents.data(), block_contents.size(), + /*last_byte*/ type); + + if (block_type == BlockType::kFilter) { + Status s = r->filter_builder->MaybePostVerifyFilter(block_contents); + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + + EncodeFixed32(trailer.data() + 1, checksum); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum", + trailer.data()); + { + IOStatus io_s = 
r->file->Append(Slice(trailer.data(), trailer.size())); + if (!io_s.ok()) { + r->SetIOStatus(io_s); + return; + } + } + + { + Status s = Status::OK(); + bool warm_cache; + switch (r->table_options.prepopulate_block_cache) { + case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: + warm_cache = (r->reason == TableFileCreationReason::kFlush); + break; + case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: + warm_cache = false; + break; + default: + // missing case + assert(false); + warm_cache = false; + } + if (warm_cache) { + if (type == kNoCompression) { + s = InsertBlockInCacheHelper(block_contents, handle, block_type); + } else if (uncompressed_block_data != nullptr) { + s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, + block_type); + } + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + s = InsertBlockInCompressedCache(block_contents, type, handle); + if (!s.ok()) { + r->SetStatus(s); + return; + } + } + + r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize); + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - + ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & + (r->alignment - 1); + IOStatus io_s = r->file->Pad(pad_bytes); + if (io_s.ok()) { + r->set_offset(r->get_offset() + pad_bytes); + } else { + r->SetIOStatus(io_s); + return; + } + } + + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); + } else { + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); + } + } +} + +void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { + Rep* r = rep_; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); + slot->Take(block_rep); + assert(block_rep != nullptr); + if (!block_rep->status.ok()) { + r->SetStatus(block_rep->status); + // Reap block so that blocked Flush() can finish + // if there is one, and Flush() will notice !ok() next time. 
+ block_rep->status = Status::OK(); + r->pc_rep->ReapBlock(block_rep); + continue; + } + + for (size_t i = 0; i < block_rep->keys->Size(); i++) { + auto& key = (*block_rep->keys)[i]; + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + + r->pc_rep->file_size_estimator.SetCurrBlockUncompSize( + block_rep->data->size()); + WriteMaybeCompressedBlock(block_rep->compressed_contents, + block_rep->compression_type, &r->pending_handle, + BlockType::kData, &block_rep->contents); + if (!ok()) { + break; + } + + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + + if (block_rep->first_key_in_next_block == nullptr) { + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr, + r->pending_handle); + } else { + Slice first_key_in_next_block = + Slice(*block_rep->first_key_in_next_block); + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), + &first_key_in_next_block, + r->pending_handle); + } + + r->pc_rep->ReapBlock(block_rep); + } +} + +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); + } + rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); +} + +Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } + +IOStatus BlockBasedTableBuilder::io_status() const { + return rep_->GetIOStatus(); +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCompressedCache( + const Slice& block_contents, const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); + Status s; + if (type != kNoCompression && block_cache_compressed != nullptr) { + size_t size = block_contents.size(); + + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); + memcpy(ubuf.get(), block_contents.data(), size); + ubuf[size] = type; + + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->has_trailer = true; +#endif // NDEBUG + + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + + s = block_cache_compressed->Insert( + key.AsSlice(), block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCacheEntry<BlockContents>); + if (s.ok()) { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } + // Invalidate OS cache. 
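+    // A copy of the compressed payload now lives in the compressed block
+    // cache, so the OS page cache copy of the range just appended is
+    // presumably redundant; any error from this hint is deliberately
+    // ignored.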
+ r->file->InvalidateCache(static_cast<size_t>(r->get_offset()), size) + .PermitUncheckedError(); + } + return s; +} + +Status BlockBasedTableBuilder::InsertBlockInCacheHelper( + const Slice& block_contents, const BlockHandle* handle, + BlockType block_type) { + Status s; + switch (block_type) { + case BlockType::kData: + case BlockType::kIndex: + case BlockType::kFilterPartitionIndex: + s = InsertBlockInCache<Block>(block_contents, handle, block_type); + break; + case BlockType::kFilter: + s = InsertBlockInCache<ParsedFullFilterBlock>(block_contents, handle, + block_type); + break; + case BlockType::kCompressionDictionary: + s = InsertBlockInCache<UncompressionDict>(block_contents, handle, + block_type); + break; + default: + // no-op / not cached + break; + } + return s; +} + +template <typename TBlocklike> +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type) { + // Uncompressed regular block cache + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + if (block_cache != nullptr) { + size_t size = block_contents.size(); + auto buf = AllocateBlock(size, block_cache->memory_allocator()); + memcpy(buf.get(), block_contents.data(), size); + BlockContents results(std::move(buf), size); + + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + + const size_t read_amp_bytes_per_bit = + rep_->table_options.read_amp_bytes_per_bit; + + // TODO akanksha:: Dedup below code by calling + // BlockBasedTable::PutDataBlockToCache. + std::unique_ptr<TBlocklike> block_holder( + BlocklikeTraits<TBlocklike>::Create( + std::move(results), read_amp_bytes_per_bit, + rep_->ioptions.statistics.get(), + false /*rep_->blocks_definitely_zstd_compressed*/, + rep_->table_options.filter_policy.get())); + + assert(block_holder->own_bytes()); + size_t charge = block_holder->ApproximateMemoryUsage(); + s = block_cache->Insert( + key.AsSlice(), block_holder.get(), + BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), charge, + nullptr, Cache::Priority::LOW); + + if (s.ok()) { + // Release ownership of block_holder. + block_holder.release(); + BlockBasedTable::UpdateCacheInsertionMetrics( + block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), + rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } + } + return s; +} + +void BlockBasedTableBuilder::WriteFilterBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()) { + // No filter block needed + return; + } + BlockHandle filter_block_handle; + bool is_partitioned_filter = rep_->table_options.partition_filters; + if (ok()) { + rep_->props.num_filter_entries += + rep_->filter_builder->EstimateEntriesAdded(); + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + // filter_data is used to store the transferred filter data payload from + // FilterBlockBuilder and deallocate the payload by going out of scope. + // Otherwise, the payload will unnecessarily remain until + // BlockBasedTableBuilder is deallocated. + // + // See FilterBlockBuilder::Finish() for more on the difference in + // transferred filter data payload among different FilterBlockBuilder + // subtypes. 
+      std::unique_ptr<const char[]> filter_data;
+      Slice filter_content =
+          rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
+
+      assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
+      if (s.IsCorruption()) {
+        rep_->SetStatus(s);
+        break;
+      }
+
+      rep_->props.filter_size += filter_content.size();
+
+      BlockType btype = is_partitioned_filter && /* last */ s.ok()
+                            ? BlockType::kFilterPartitionIndex
+                            : BlockType::kFilter;
+      WriteMaybeCompressedBlock(filter_content, kNoCompression,
+                                &filter_block_handle, btype);
+    }
+    rep_->filter_builder->ResetFilterBitsBuilder();
+  }
+  if (ok()) {
+    // Add mapping from "<filter_block_prefix>.Name" to location
+    // of filter data.
+    std::string key;
+    key = is_partitioned_filter ? BlockBasedTable::kPartitionedFilterBlockPrefix
+                                : BlockBasedTable::kFullFilterBlockPrefix;
+    key.append(rep_->table_options.filter_policy->CompatibilityName());
+    meta_index_builder->Add(key, filter_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+  if (!ok()) {
+    return;
+  }
+  IndexBuilder::IndexBlocks index_blocks;
+  auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+  if (index_builder_status.IsIncomplete()) {
+    // If we have more than one index partition then meta_blocks are not
+    // supported for the index. Currently meta_blocks are used only by
+    // HashIndexBuilder which is not multi-partition.
+    assert(index_blocks.meta_blocks.empty());
+  } else if (ok() && !index_builder_status.ok()) {
+    rep_->SetStatus(index_builder_status);
+  }
+  if (ok()) {
+    for (const auto& item : index_blocks.meta_blocks) {
+      BlockHandle block_handle;
+      WriteBlock(item.second, &block_handle, BlockType::kIndex);
+      if (!ok()) {
+        break;
+      }
+      meta_index_builder->Add(item.first, block_handle);
+    }
+  }
+  if (ok()) {
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle,
+                 BlockType::kIndex);
+    } else {
+      WriteMaybeCompressedBlock(index_blocks.index_block_contents,
+                                kNoCompression, index_block_handle,
+                                BlockType::kIndex);
+    }
+  }
+  // If there are more index partitions, finish them and write them out
+  if (index_builder_status.IsIncomplete()) {
+    bool index_building_finished = false;
+    while (ok() && !index_building_finished) {
+      Status s =
+          rep_->index_builder->Finish(&index_blocks, *index_block_handle);
+      if (s.ok()) {
+        index_building_finished = true;
+      } else if (s.IsIncomplete()) {
+        // More partitioned index after this one
+        assert(!index_building_finished);
+      } else {
+        // Error
+        rep_->SetStatus(s);
+        return;
+      }
+
+      if (rep_->table_options.enable_index_compression) {
+        WriteBlock(index_blocks.index_block_contents, index_block_handle,
+                   BlockType::kIndex);
+      } else {
+        WriteMaybeCompressedBlock(index_blocks.index_block_contents,
+                                  kNoCompression, index_block_handle,
+                                  BlockType::kIndex);
+      }
+      // The last index_block_handle will be for the partition index block
+    }
+  }
+}
+
+void BlockBasedTableBuilder::WritePropertiesBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  BlockHandle properties_block_handle;
+  if (ok()) {
+    PropertyBlockBuilder property_block_builder;
+    rep_->props.filter_policy_name =
+        rep_->table_options.filter_policy != nullptr
+            ? rep_->table_options.filter_policy->Name()
+            : "";
+    rep_->props.index_size =
+        rep_->index_builder->IndexSize() + kBlockTrailerSize;
+    rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
+                                      ?
rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; + rep_->props.compression_name = + CompressionTypeToString(rep_->compression_type); + rep_->props.compression_options = + CompressionOptionsToString(rep_->compression_opts); + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? rep_->moptions.prefix_extractor->AsString() + : "nullptr"; + std::string property_collectors_names = "["; + for (size_t i = 0; + i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { + if (i != 0) { + property_collectors_names += ","; + } + property_collectors_names += + rep_->ioptions.table_properties_collector_factories[i]->Name(); + } + property_collectors_names += "]"; + rep_->props.property_collectors_names = property_collectors_names; + if (rep_->table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + assert(rep_->p_index_builder_ != nullptr); + rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); + rep_->props.top_level_index_size = + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); + } + rep_->props.index_key_is_user_key = + !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast<uint64_t>( + static_cast<double>(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast<uint64_t>( + static_cast<double>(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. 
+ rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } + + // Add basic properties + property_block_builder.AddTableProperty(rep_->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, + rep_->ioptions.logger, + &property_block_builder); + + Slice block_data = property_block_builder.Finish(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data); + WriteMaybeCompressedBlock(block_data, kNoCompression, + &properties_block_handle, BlockType::kProperties); + } + if (ok()) { +#ifndef NDEBUG + { + uint64_t props_block_offset = properties_block_handle.offset(); + uint64_t props_block_size = properties_block_handle.size(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset", + &props_block_offset); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize", + &props_block_size); + } +#endif // !NDEBUG + + const std::string* properties_block_meta = &kPropertiesBlockName; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", + &properties_block_meta); + meta_index_builder->Add(*properties_block_meta, properties_block_handle); + } +} + +void BlockBasedTableBuilder::WriteCompressionDictBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->compression_dict != nullptr && + rep_->compression_dict->GetRawDict().size()) { + BlockHandle compression_dict_block_handle; + if (ok()) { + WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(), + kNoCompression, &compression_dict_block_handle, + BlockType::kCompressionDictionary); +#ifndef NDEBUG + Slice compression_dict = rep_->compression_dict->GetRawDict(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + &compression_dict); +#endif // NDEBUG + } + if (ok()) { + meta_index_builder->Add(kCompressionDictBlockName, + compression_dict_block_handle); + } + } +} + +void BlockBasedTableBuilder::WriteRangeDelBlock( + MetaIndexBuilder* meta_index_builder) { + if (ok() && !rep_->range_del_block.empty()) { + BlockHandle range_del_block_handle; + WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression, + &range_del_block_handle, + BlockType::kRangeDeletion); + meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle); + } +} + +void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle) { + Rep* r = rep_; + // this is guaranteed by BlockBasedTableBuilder's constructor + assert(r->table_options.checksum == kCRC32c || + r->table_options.format_version != 0); + assert(ok()); + + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, + r->get_offset(), r->table_options.checksum, + metaindex_block_handle, index_block_handle); + IOStatus ios = r->file->Append(footer.GetSlice()); + if (ios.ok()) { + r->set_offset(r->get_offset() + footer.GetSlice().size()); + } else { + r->SetIOStatus(ios); + } +} + +void BlockBasedTableBuilder::EnterUnbuffered() { + Rep* r = rep_; + assert(r->state == Rep::State::kBuffered); + r->state = Rep::State::kUnbuffered; + const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 + ? 
r->compression_opts.zstd_max_train_bytes + : r->compression_opts.max_dict_bytes; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. + const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. + const size_t kPrimeGeneratorRemainder = static_cast<size_t>( + kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + + std::string compression_dict_samples; + std::vector<size_t> compression_dict_sample_lens; + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; + } + } + + // final data block flushed, now we can generate dictionary from the samples. + // OK if compression_dict_samples is empty, we'll just get empty dictionary. 
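+  // Purely illustrative example of the stride above (hypothetical numbers):
+  // with kNumBlocksBuffered = 10, kPrimeGeneratorRemainder is
+  // 545055921143 % 10 = 3 and kInitSampleIdx = 5, so the buffers are visited
+  // in the order 5, 8, 1, 4, 7, 0, 3, 6, 9, 2 -- each exactly once, because
+  // gcd(3, 10) == 1 -- until kSampleBytes worth of samples has been taken.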
+ std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + if (r->compression_opts.use_zstd_dict_trainer) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = ZSTD_FinalizeDictionary( + compression_dict_samples, compression_dict_sample_lens, + r->compression_opts.max_dict_bytes, r->compression_opts.level); + } + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; + assert(!data_block.empty()); + + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); + + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr<DataBlockIter>(iter); + }; + + std::unique_ptr<DataBlockIter> iter = nullptr, next_block_iter = nullptr; + + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; + + if (i + 1 < r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } + + auto& data_block = r->data_block_buffers[i]; + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; + } + + std::vector<std::string> keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); + } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData); + if (ok() && i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); + } + } + std::swap(iter, next_block_iter); + } + r->data_block_buffers.clear(); + r->data_begin_offset = 0; + // Release all reserved cache for data block buffers + if (r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation( + r->data_begin_offset); + s.PermitUncheckedError(); + } +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + assert(r->state != Rep::State::kClosed); + 
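+  // Capture whether the final data block is empty before calling Flush(),
+  // since Flush() hands off (and resets) r->data_block; the flag decides
+  // below whether a last index entry still needs to be emitted on the
+  // non-parallel-compression path.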
bool empty_data_block = r->data_block.empty(); + r->first_key_in_next_block = nullptr; + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); +#ifndef NDEBUG + for (const auto& br : r->pc_rep->block_rep_buf) { + assert(br.status.ok()); + } +#endif // !NDEBUG + } else { + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + } + + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); + if (ok()) { + // flush the meta index block + WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle, BlockType::kMetaIndex); + } + if (ok()) { + WriteFooter(metaindex_block_handle, index_block_handle); + } + r->state = Rep::State::kClosed; + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); + assert(!ret_status.ok() || io_status().ok()); + return ret_status; +} + +void BlockBasedTableBuilder::Abandon() { + assert(rep_->state != Rep::State::kClosed); + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); + } + rep_->state = Rep::State::kClosed; + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +bool BlockBasedTableBuilder::IsEmpty() const { + return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } + +uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { + if (rep_->IsParallelCompressionEnabled()) { + // Use compression ratio so far and inflight uncompressed bytes to estimate + // final SST size. 
+ return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); + } else { + return FileSize(); + } +} + +bool BlockBasedTableBuilder::NeedCompact() const { + for (const auto& collector : rep_->table_properties_collectors) { + if (collector->NeedCompact()) { + return true; + } + } + return false; +} + +TableProperties BlockBasedTableBuilder::GetTableProperties() const { + TableProperties ret = rep_->props; + for (const auto& collector : rep_->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + ret.readable_properties.insert(prop); + } + collector->Finish(&ret.user_collected_properties).PermitUncheckedError(); + } + return ret; +} + +std::string BlockBasedTableBuilder::GetFileChecksum() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName; + } +} +void BlockBasedTableBuilder::SetSeqnoTimeTableProperties( + const std::string& encoded_seqno_to_time_mapping, + uint64_t oldest_ancestor_time) { + rep_->props.seqno_to_time_mapping = encoded_seqno_to_time_mapping; + rep_->props.creation_time = oldest_ancestor_time; +} + +const std::string BlockBasedTable::kObsoleteFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; +const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = + "partitionedfilter."; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_builder.h b/src/rocksdb/table/block_based/block_based_table_builder.h new file mode 100644 index 000000000..ecc13d0f7 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.h @@ -0,0 +1,203 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <array> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/version_edit.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). 
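+  // A minimal usage sketch (illustrative only; option setup and the
+  // WritableFileWriter are assumed to exist and are elided, and
+  // `sorted_entries` is a hypothetical container of internal key/value
+  // pairs already ordered by the comparator):
+  //
+  //   BlockBasedTableBuilder builder(table_options, builder_options, file);
+  //   for (const auto& kv : sorted_entries) {
+  //     builder.Add(kv.first, kv.second);
+  //   }
+  //   Status s = builder.Finish();  // or builder.Abandon() on failure
+  //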
+ BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + bool IsEmpty() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + uint64_t EstimatedFileSize() const override; + + bool NeedCompact() const override; + + // Get table properties + TableProperties GetTableProperties() const override; + + // Get file checksum + std::string GetFileChecksum() const override; + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + void SetSeqnoTimeTableProperties( + const std::string& encoded_seqno_to_time_mapping, + uint64_t oldest_ancestor_time) override; + + private: + bool ok() const { return status().ok(); } + + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. + // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, + BlockType blocktype); + + // Compress and write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + BlockType block_type); + // Directly write data to the file. 
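+  // When `raw_data` is supplied for a compressed block, it carries the
+  // uncompressed payload so that the block cache can be warmed with it (see
+  // the prepopulate_block_cache handling in the .cc file).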
+ void WriteMaybeCompressedBlock(const Slice& data, CompressionType, + BlockHandle* handle, BlockType block_type, + const Slice* raw_data = nullptr); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + + template <typename TBlocklike> + Status InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle, BlockType block_type); + + Status InsertBlockInCacheHelper(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type); + + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + struct ParallelCompressionRep; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Some compression libraries fail when the uncompressed size is bigger than + // int. If uncompressed size is bigger than kCompressionSizeLimit, don't + // compress it + const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max(); + + // Get blocks from mem-table walking thread, compress them and + // pass them to the write thread. Used in parallel compression mode only + void BGWorkCompression(const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx); + + // Given uncompressed block content, try to compress it and return result and + // compression type + void CompressAndVerifyBlock(const Slice& uncompressed_block_data, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); + + // Get compressed blocks from BGWorkCompression and write them into SST + void BGWorkWriteMaybeCompressedBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads + void StopParallelCompression(); +}; + +Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc new file mode 100644 index 000000000..09c1d2f62 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.cc @@ -0,0 +1,1058 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/block_based_table_factory.h" + +#include <stdint.h> + +#include <cinttypes> +#include <memory> +#include <string> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "util/mutexlock.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector<size_t> sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. 
+ // The difference between wasted using 4st and 3rd record, will + // be following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between 4st and 3rd record, + // times 3, which is number of records before the 4st. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + +#ifndef ROCKSDB_LITE + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map<std::string, PinningTier> + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + +static std::unordered_map<std::string, BlockBasedTableOptions::IndexType> + block_base_table_index_type_string_map = { + {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, + {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, + {"kTwoLevelIndexSearch", + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; + +static std::unordered_map<std::string, + BlockBasedTableOptions::DataBlockIndexType> + block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + +static std::unordered_map<std::string, + BlockBasedTableOptions::IndexShorteningMode> + block_base_table_index_shortening_mode_string_map = { + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; + +static std::unordered_map<std::string, OptionTypeInfo> + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum<PinningTier>( + offsetof(struct 
MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map<std::string, + BlockBasedTableOptions::PrepopulateBlockCache> + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, + {"kFlushOnly", + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + +#endif // ROCKSDB_LITE + +static std::unordered_map<std::string, OptionTypeInfo> + block_based_table_type_info = { +#ifndef ROCKSDB_LITE + /* currently not supported + std::shared_ptr<Cache> block_cache = nullptr; + std::shared_ptr<Cache> block_cache_compressed = nullptr; + CacheUsageOptions cache_usage_options; + */ + {"flush_block_policy_factory", + OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>( + offsetof(struct BlockBasedTableOptions, + flush_block_policy_factory), + OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>( + offsetof(struct BlockBasedTableOptions, index_type), + &block_base_table_index_type_string_map)}, + {"hash_index_allow_collision", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"data_block_index_type", + OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>( + offsetof(struct BlockBasedTableOptions, data_block_index_type), + &block_base_table_data_block_index_type_string_map)}, + {"index_shortening", + OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>( + offsetof(struct BlockBasedTableOptions, index_shortening), + &block_base_table_index_shortening_mode_string_map)}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, 
+ {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_memory", + {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"filter_policy", + OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>( + offsetof(struct BlockBasedTableOptions, filter_policy), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kNone)}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"detect_filter_construct_corruption", + {offsetof(struct BlockBasedTableOptions, + detect_filter_construct_corruption), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"reserve_table_builder_memory", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"reserve_table_reader_memory", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 + // and 6.14. The bug will write out 8 bytes to OPTIONS file from the + // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit + // which is actually a uint32. Consequently, the value of + // read_amp_bytes_per_bit written in the OPTIONS file is wrong. + // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit + // from OPTIONS file as a uint32. To be able to load OPTIONS file + // generated by affected releases before the fix, we need to + // manually parse read_amp_bytes_per_bit with this special hack. 
+ uint64_t read_amp_bytes_per_bit = ParseUint64(value); + *(static_cast<uint32_t*>(addr)) = + static_cast<uint32_t>(read_amp_bytes_per_bit); + return Status::OK(); + }}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"block_cache", + {offsetof(struct BlockBasedTableOptions, block_cache), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input value as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast<std::shared_ptr<Cache>*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"block_cache_compressed", + {offsetof(struct BlockBasedTableOptions, block_cache_compressed), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input value as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast<std::shared_ptr<Cache>*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map, + OptionTypeFlags::kMutable)}, + {"initial_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"num_file_reads_for_auto_readahead", + {offsetof(struct BlockBasedTableOptions, + num_file_reads_for_auto_readahead), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + +#endif // ROCKSDB_LITE +}; + +// TODO(myabandeh): We should return an error instead of silently changing the +// options +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + InitializeOptions(); + RegisterOptions(&table_options_, &block_based_table_type_info); + + const auto table_reader_charged = + table_options_.cache_usage_options.options_overrides + .at(CacheEntryRole::kBlockBasedTableReader) + .charged; + if (table_options_.block_cache && + table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) { + table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( + std::make_shared<CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>>( + table_options_.block_cache))); + } +} 
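+
+// Typical wiring of this factory, shown as an illustrative sketch only (the
+// cache capacity and bloom bits-per-key below are arbitrary example values):
+//
+//   BlockBasedTableOptions bbto;
+//   bbto.block_cache = NewLRUCache(256 << 20);
+//   bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+//   Options options;
+//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));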
+ +void BlockBasedTableFactory::InitializeOptions() { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + co.low_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + table_options_.index_block_restart_interval != 1) { + // Currently kHashSearch is incompatible with + // index_block_restart_interval > 1 + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } + auto& options_overrides = + table_options_.cache_usage_options.options_overrides; + const auto options = table_options_.cache_usage_options.options; + for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { + CacheEntryRole role = static_cast<CacheEntryRole>(i); + auto options_overrides_iter = options_overrides.find(role); + if (options_overrides_iter == options_overrides.end()) { + options_overrides.insert({role, options}); + } else if (options_overrides_iter->second.charged == + CacheEntryRoleOptions::Decision::kFallback) { + options_overrides_iter->second.charged = options.charged; + } + } +} + +Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) { + InitializeOptions(); + return TableFactory::PrepareOptions(opts); +} + +namespace { +// Different cache kinds use the same keys for physically different values, so +// they must not share an underlying key space with each other. +Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { + int cache_count = (bbto.block_cache != nullptr) + + (bbto.block_cache_compressed != nullptr) + + (bbto.persistent_cache != nullptr); + if (cache_count <= 1) { + // Nothing to share / overlap + return Status::OK(); + } + + // Simple pointer equality + if (bbto.block_cache == bbto.block_cache_compressed) { + return Status::InvalidArgument( + "block_cache same as block_cache_compressed not currently supported, " + "and would be bad for performance anyway"); + } + + // More complex test of shared key space, in case the instances are wrappers + // for some shared underlying cache. 
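+  // The check works by inserting a distinct one-byte marker under the same
+  // process-unique key into each configured cache and then looking the key
+  // up in every cache: getting back another cache's marker (or mutated
+  // contents) indicates a shared key space, which is rejected below.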
+ CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime(); + static char kRegularBlockCacheMarker = 'b'; + static char kCompressedBlockCacheMarker = 'c'; + static char kPersistentCacheMarker = 'p'; + if (bbto.block_cache) { + bbto.block_cache + ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1, + GetNoopDeleterForRole<CacheEntryRole::kMisc>()) + .PermitUncheckedError(); + } + if (bbto.block_cache_compressed) { + bbto.block_cache_compressed + ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1, + GetNoopDeleterForRole<CacheEntryRole::kMisc>()) + .PermitUncheckedError(); + } + if (bbto.persistent_cache) { + // Note: persistent cache copies the data, not keeping the pointer + bbto.persistent_cache + ->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1) + .PermitUncheckedError(); + } + // If we get something different from what we inserted, that indicates + // dangerously overlapping key spaces. + if (bbto.block_cache) { + auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice()); + if (handle) { + auto v = static_cast<char*>(bbto.block_cache->Value(handle)); + char c = *v; + bbto.block_cache->Release(handle); + if (v == &kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache and block_cache_compressed share the same key space, " + "which is not supported"); + } else if (c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache and persistent_cache share the same key space, " + "which is not supported"); + } else if (v != &kRegularBlockCacheMarker) { + return Status::Corruption("Unexpected mutation to block_cache"); + } + } + } + if (bbto.block_cache_compressed) { + auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice()); + if (handle) { + auto v = static_cast<char*>(bbto.block_cache_compressed->Value(handle)); + char c = *v; + bbto.block_cache_compressed->Release(handle); + if (v == &kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and block_cache share the same key space, " + "which is not supported"); + } else if (c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and persistent_cache share the same key " + "space, " + "which is not supported"); + } else if (v != &kCompressedBlockCacheMarker) { + return Status::Corruption( + "Unexpected mutation to block_cache_compressed"); + } + } + } + if (bbto.persistent_cache) { + std::unique_ptr<char[]> data; + size_t size = 0; + bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size) + .PermitUncheckedError(); + if (data && size > 0) { + if (data[0] == kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache share the same key space, " + "which is not supported"); + } else if (data[0] == kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache_compressed share the same key " + "space, " + "which is not supported"); + } else if (data[0] != kPersistentCacheMarker) { + return Status::Corruption("Unexpected mutation to persistent_cache"); + } + } + } + return Status::OK(); +} + +} // namespace + +Status BlockBasedTableFactory::NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + ro, table_reader_options.ioptions, 
table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader, table_reader_cache_res_mgr_, + table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, + table_reader_options.skip_filters, table_reader_options.level, + table_reader_options.immortal, table_reader_options.largest_seqno, + table_reader_options.force_direct_prefetch, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer, + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, table_reader_options.cur_file_num, + table_reader_options.unique_id); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); +} + +Status BlockBasedTableFactory::ValidateOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument( + "Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } + if (!IsSupportedFormatVersion(table_options_.format_version)) { + return Status::InvalidArgument( + "Unsupported BlockBasedTable format_version. 
Please check " + "include/rocksdb/table.h for more info"); + } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument( + "Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } + if (table_options_.block_size > std::numeric_limits<uint32_t>::max()) { + return Status::InvalidArgument( + "block size exceeds maximum number (4GiB) allowed"); + } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } + const auto& options_overrides = + table_options_.cache_usage_options.options_overrides; + for (auto options_overrides_iter = options_overrides.cbegin(); + options_overrides_iter != options_overrides.cend(); + ++options_overrides_iter) { + const CacheEntryRole role = options_overrides_iter->first; + const CacheEntryRoleOptions options = options_overrides_iter->second; + static const std::set<CacheEntryRole> kMemoryChargingSupported = { + CacheEntryRole::kCompressionDictionaryBuildingBuffer, + CacheEntryRole::kFilterConstruction, + CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata, + CacheEntryRole::kBlobCache}; + if (options.charged != CacheEntryRoleOptions::Decision::kFallback && + kMemoryChargingSupported.count(role) == 0) { + return Status::NotSupported( + "Enable/Disable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " is not supported"); + } + if (table_options_.no_block_cache && + options.charged == CacheEntryRoleOptions::Decision::kEnabled) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " but block cache is disabled"); + } + if (role == CacheEntryRole::kBlobCache && + options.charged == CacheEntryRoleOptions::Decision::kEnabled) { + if (cf_opts.blob_cache == nullptr) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " but blob cache is not configured"); + } + if (table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " but block cache is disabled"); + } + if (table_options_.block_cache == cf_opts.blob_cache) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " but blob cache is the same as block cache"); + } + if (cf_opts.blob_cache->GetCapacity() > + table_options_.block_cache->GetCapacity()) { + return Status::InvalidArgument( + "Enable CacheEntryRoleOptions::charged" + " for CacheEntryRole " + + 
kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] + + " but blob cache capacity is larger than block cache capacity"); + } + } + } + { + Status s = CheckCacheOptionCompatibility(table_options_); + if (!s.ok()) { + return s; + } + } + std::string garbage; + if (!SerializeEnum<ChecksumType>(checksum_type_string_map, + table_options_.checksum, &garbage)) { + return Status::InvalidArgument( + "Unrecognized ChecksumType for checksum: " + + std::to_string(static_cast<uint32_t>(table_options_.checksum))); + } + return TableFactory::ValidateOptions(db_opts, cf_opts); +} + +std::string BlockBasedTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", + table_options_.flush_block_policy_factory->Name(), + static_cast<void*>(table_options_.flush_block_policy_factory.get())); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", + table_options_.cache_index_and_filter_blocks); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " cache_index_and_filter_blocks_with_high_priority: %d\n", + table_options_.cache_index_and_filter_blocks_with_high_priority); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", + table_options_.pin_top_level_index_and_filter); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast<int>(table_options_.index_shortening)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast<void*>(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + static_cast<void*>(table_options_.block_cache_compressed.get())); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + const char* block_cache_compressed_name = + table_options_.block_cache_compressed->Name(); + if (block_cache_compressed_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_compressed_name); + ret.append(buffer); + } + ret.append(" block_cache_compressed_options:\n"); + ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast<void*>(table_options_.persistent_cache.get())); + 
ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? "nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast<int>(table_options_.prepopulate_block_cache)); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.initial_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " num_file_reads_for_auto_readahead: %" PRIu64 "\n", + table_options_.num_file_reads_for_auto_readahead); + ret.append(buffer); + return ret; +} + +const void* BlockBasedTableFactory::GetOptionsPtr( + const std::string& name) const { + if (name == kBlockCacheOpts()) { + if (table_options_.no_block_cache) { + return nullptr; + } else { + return table_options_.block_cache.get(); + } + } else { + return TableFactory::GetOptionsPtr(name); + } +} + +#ifndef ROCKSDB_LITE +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". +// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. 
The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". +Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* opt_ptr) { + Status status = TableFactory::ParseOption(config_options, opt_info, opt_name, + opt_value, opt_ptr); + if (config_options.input_strings_escaped && !status.ok()) { // Got an error + // !input_strings_escaped indicates the old API, where everything is + // parsable. 
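The doc comment above describes the string forms ("bloomfilter:4:true", "1M") accepted for the non-primitive options. A small, hedged usage sketch of the string entry point defined just below (values are illustrative only):

rocksdb::Status ParseFromString(rocksdb::BlockBasedTableOptions* parsed) {
  rocksdb::BlockBasedTableOptions defaults;
  // On success: parsed->block_size == 4096, parsed->filter_policy is a Bloom
  // filter with 4 bits per key, and parsed->block_cache is a 1 MiB LRU cache.
  return rocksdb::GetBlockBasedTableOptionsFromString(
      defaults,
      "block_size=4096;filter_policy=bloomfilter:4:true;block_cache=1M",
      parsed);
}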
+ if (opt_info.IsByName()) { + status = Status::OK(); + } + } + return status; +} + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + config_options.ignore_unsupported_options = false; + + return GetBlockBasedTableOptionsFromString(config_options, table_options, + opts_str, new_table_options); +} +Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.invoke_prepare_options = false; + + return GetBlockBasedTableOptionsFromMap(config_options, table_options, + opts_map, new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options) { + assert(new_table_options); + BlockBasedTableFactory bbtf(table_options); + Status s = bbtf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(bbtf.GetOptions<BlockBasedTableOptions>()); + } else { + *new_table_options = table_options; + } + return s; +} +#endif // !ROCKSDB_LITE + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.h b/src/rocksdb/table/block_based/block_based_table_factory.h new file mode 100644 index 000000000..3166cd3cc --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <memory> +#include <string> + +#include "cache/cache_reservation_manager.h" +#include "port/port.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; +struct EnvOptions; + +class BlockBasedTableBuilder; +class RandomAccessFileReader; +class WritableFileWriter; + +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. + size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + + const char* Name() const override { return kBlockBasedTableName(); } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + // Valdates the specified DB Options. 
+ Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + Status PrepareOptions(const ConfigOptions& opts) override; + + std::string GetPrintableOptions() const override; + + bool IsDeleteRangeSupported() const override { return true; } + + TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; } + + protected: + const void* GetOptionsPtr(const std::string& name) const override; +#ifndef ROCKSDB_LITE + Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const std::string& opt_value, + void* opt_ptr) override; +#endif + void InitializeOptions(); + + private: + BlockBasedTableOptions table_options_; + std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr_; + mutable TailPrefetchStats tail_prefetch_stats_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +extern const std::string kPropTrue; +extern const std::string kPropFalse; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_iterator.cc b/src/rocksdb/table/block_based/block_based_table_iterator.cc new file mode 100644 index 000000000..d2605670f --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_iterator.cc @@ -0,0 +1,459 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); } + +void BlockBasedTableIterator::Seek(const Slice& target) { + SeekImpl(&target, true); +} + +void BlockBasedTableIterator::SeekImpl(const Slice* target, + bool async_prefetch) { + bool is_first_pass = true; + if (async_read_in_progress_) { + AsyncInitDataBlock(false); + is_first_pass = false; + } + + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. 
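In caller terms, the reseek shortcut described above means a forward Seek() to a key that is larger than the current position but still below the current block's index boundary can skip the index entirely. A hedged sketch at the public iterator level (assumes an open rocksdb::DB* and that the DB iterator eventually drives this SeekImpl()):

#include "rocksdb/db.h"

void ReseekNearby(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  it->Seek("key100");  // index seek + data block read
  it->Seek("key142");  // forward reseek: if "key142" still sorts before the
                       // current block's boundary key, need_seek_index stays
                       // false and the already-loaded block is reused
}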
+ if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + allow_unprepared_value_) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + if (read_options_.async_io && async_prefetch) { + if (is_first_pass) { + AsyncInitDataBlock(is_first_pass); + } + if (async_read_in_progress_) { + // Status::TryAgain indicates asynchronous request for retrieval of + // data blocks has been submitted. So it should return at this point + // and Seek should be called again to retrieve the requested block and + // execute the remaining code. + return; + } + } else { + InitDataBlock(); + } + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || icomp_.Compare(*target, key()) <= 0); + } +} + +void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now totally disable prefix seek in auto prefix mode because we don't + // have logic + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely to contain the position for `target`, the + // same as Seek(), rather than than before. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is difference is when they seek to a position + // in the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. In this case, we'll + // end up with reading two blocks. 
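Restating the comment's three-block example as a hedged, user-level sketch (keys are zero-padded so byte order matches numeric order; `it` is assumed to be a public rocksdb::Iterator over such a table):

void SeekForPrevDemo(rocksdb::Iterator* it) {
  // Blocks as in the comment above: ["02","04"] ["06","08"] ["10","12"]
  it->SeekForPrev("07");  // -> "06": served from the same (second) block that
                          //    Seek("07") would read
  it->SeekForPrev("05");  // -> "04": the index seek lands in the second block,
                          //    then the iterator steps back into the first,
                          //    reading two blocks as described above
}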
+ index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = !is_at_first_key_from_index_; + } + return is_valid; +} + +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. 
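Both readahead modes mentioned above are driven by public options. A hedged configuration sketch (values are illustrative; the field names are the ones registered in the option map earlier in this directory):

void ConfigureReadahead(rocksdb::BlockBasedTableOptions* bbto,
                        rocksdb::ReadOptions* ro) {
  // Implicit auto readahead, tuned via table options:
  bbto->initial_auto_readahead_size = 8 * 1024;    // starting window
  bbto->max_auto_readahead_size = 256 * 1024;      // upper bound of the window
  bbto->num_file_reads_for_auto_readahead = 2;     // sequential reads to arm it
  // Explicit readahead, requested per read:
  ro->readahead_size = 2 * 1024 * 1024;            // prefetch from the first IO
}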
+ block_prefetcher_.PrefetchIfNeeded( + rep, data_block_handle, read_options_.readahead_size, is_for_compaction, + /*no_sequential_checking=*/false, read_options_.rate_limiter_priority); + Status s; + table_->NewDataBlockIterator<DataBlockIter>( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { + BlockHandle data_block_handle = index_iter_->value().handle; + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + if (is_first_pass) { + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is + // set. + // In case of async_io with Implicit readahead, block_prefetcher_ will + // always the create the prefetch buffer by setting no_sequential_checking + // = true. + block_prefetcher_.PrefetchIfNeeded( + rep, data_block_handle, read_options_.readahead_size, + is_for_compaction, /*no_sequential_checking=*/read_options_.async_io, + read_options_.rate_limiter_priority); + + Status s; + table_->NewDataBlockIterator<DataBlockIter>( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/true, s); + + if (s.IsTryAgain()) { + async_read_in_progress_ = true; + return; + } + } + } else { + // Second pass will call the Poll to get the data block which has been + // requested asynchronously. + Status s; + table_->NewDataBlockIterator<DataBlockIter>( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + } + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + async_read_in_progress_ = false; +} + +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.status().ok()) { + return false; + } + + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. 
+ + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && + block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock; + assert(!next_block_is_out_of_bound || + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, + index_iter_->user_key(), /*b_has_ts=*/true) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + if (!v.first_internal_key.empty() && allow_unprepared_value_) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. +} + +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock && + Valid()) { + is_out_of_bound_ = + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(), + /*b_has_ts=*/true) <= 0; + } +} + +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, + /*a_has_ts=*/false, index_iter_->user_key(), + /*b_has_ts=*/true) > 0) + ? 
BlockUpperBound::kUpperBoundBeyondCurBlock + : BlockUpperBound::kUpperBoundInCurBlock; + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_iterator.h b/src/rocksdb/table/block_based/block_based_table_iterator.h new file mode 100644 index 000000000..a2918b248 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_iterator.h @@ -0,0 +1,280 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterates over the contents of BlockBasedTable. +class BlockBasedTableIterator : public InternalIteratorBase<Slice> { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + // @param read_options Must outlive this iterator. + public: + BlockBasedTableIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, TableReaderCaller caller, + size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + lookup_context_(caller), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size), + allow_unprepared_value_(allow_unprepared_value), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + async_read_in_progress_(false) {} + + ~BlockBasedTableIterator() {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + bool PrepareValue() override { + assert(Valid()); + + if (!is_at_first_key_from_index_) { + return true; + } + + return const_cast<BlockBasedTableIterator*>(this) + ->MaterializeCurrentBlock(); + } + Slice value() const override { + // PrepareValue() must have been called. 
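A hedged sketch of the deferred-value contract that the assertions below enforce, using only methods declared on this class; it assumes the iterator was constructed with allow_unprepared_value set to true:

size_t SumKeyAndValueBytes(BlockBasedTableIterator* iter) {
  size_t total = 0;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    total += iter->key().size();    // key() is safe whenever Valid() is true
    if (!iter->PrepareValue()) {    // materializes the deferred data block
      break;                        // read error or index/block mismatch
    }
    total += iter->value().size();  // only valid after PrepareValue() succeeds
  }
  return total;
}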
+ assert(!is_at_first_key_from_index_); + assert(Valid()); + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else if (async_read_in_progress_) { + return Status::TryAgain(); + } else { + return Status::OK(); + } + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (is_out_of_bound_) { + return IterBoundCheck::kOutOfBound; + } else if (block_upper_bound_check_ == + BlockUpperBound::kUpperBoundBeyondCurBlock) { + assert(!is_out_of_bound_); + return IterBoundCheck::kInbound; + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + assert(!is_at_first_key_from_index_); + assert(Valid()); + + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + block_upper_bound_check_ = BlockUpperBound::kUnknown; + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->GetReadaheadState(readahead_file_info); + } + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->SetReadaheadState(readahead_file_info); + } + } + } + + std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_; + + private: + enum class IterDirection { + kForward, + kBackward, + }; + // This enum indicates whether the upper bound falls into current block + // or beyond. + // +-------------+ + // | cur block | <-- (1) + // +-------------+ + // <-- (2) + // --- <boundary key> --- + // <-- (3) + // +-------------+ + // | next block | <-- (4) + // ...... + // + // When the block is smaller than <boundary key>, kUpperBoundInCurBlock + // is the value to use. The examples are (1) or (2) in the graph. It means + // all keys in the next block or beyond will be out of bound. Keys within + // the current block may or may not be out of bound. 
+ // When the block is larger or equal to <boundary key>, + // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4) + // in the graph. It means that all keys in the current block is within the + // upper bound and keys in the next block may or may not be within the uppder + // bound. + // If the boundary key hasn't been checked against the upper bound, + // kUnknown can be used. + enum class BlockUpperBound { + kUpperBoundInCurBlock, + kUpperBoundBeyondCurBlock, + kUnknown, + }; + + const BlockBasedTable* table_; + const ReadOptions& read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + PinnedIteratorsManager* pinned_iters_mgr_; + DataBlockIter block_iter_; + const SliceTransform* prefix_extractor_; + uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max(); + BlockCacheLookupContext lookup_context_; + + BlockPrefetcher block_prefetcher_; + + const bool allow_unprepared_value_; + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). + bool is_out_of_bound_ = false; + // How current data block's boundary key with the next block is compared with + // iterate upper bound. + BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to PrepareValue() will trigger loading the block. + bool is_at_first_key_from_index_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + + bool async_read_in_progress_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target, bool async_prefetch); + + void InitDataBlock(); + void AsyncInitDataBlock(bool is_first_pass); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); + + bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { + if (need_upper_bound_check_ && direction == IterDirection::kBackward) { + // Upper bound check isn't sufficient for backward direction to + // guarantee the same result as total order, so disable prefix + // check. + return true; + } + if (check_filter_ && !table_->PrefixRangeMayMatch( + ikey, read_options_, prefix_extractor_, + need_upper_bound_check_, &lookup_context_)) { + // TODO remember the iterator is invalidated because of prefix + // match. This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.cc b/src/rocksdb/table/block_based/block_based_table_reader.cc new file mode 100644 index 000000000..43962ba1d --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.cc @@ -0,0 +1,3092 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" + +#include <algorithm> +#include <array> +#include <atomic> +#include <cstdint> +#include <limits> +#include <memory> +#include <string> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "db/compaction/compaction_picker.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "file/file_prefetch_buffer.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/lang.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/trace_record.h" +#include "table/block_based/binary_search_index_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_like_traits.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/hash_index_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/partitioned_index_reader.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} +} // namespace +} // namespace ROCKSDB_NAMESPACE + +// Generate the regular and coroutine versions of some methods by +// including block_based_table_reader_sync_and_async.h twice +// Macros in the header will expand differently based on whether +// WITH_COROUTINES or WITHOUT_COROUTINES is defined +// clang-format off +#define WITHOUT_COROUTINES +#include "table/block_based/block_based_table_reader_sync_and_async.h" +#undef WITHOUT_COROUTINES +#define WITH_COROUTINES +#include "table/block_based/block_based_table_reader_sync_and_async.h" +#undef 
WITH_COROUTINES +// clang-format on + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +BlockBasedTable::~BlockBasedTable() { delete rep_; } + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. +template <typename TBlocklike> +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, + MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd, + const FilterPolicy* filter_policy, bool async_read) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + Status s; + // If prefetch_buffer is not allocated, it will fallback to synchronous + // reading of block contents. + if (async_read && prefetch_buffer != nullptr) { + s = block_fetcher.ReadAsyncBlockContents(); + if (!s.ok()) { + return s; + } + } else { + s = block_fetcher.ReadBlockContents(); + } + if (s.ok()) { + result->reset(BlocklikeTraits<TBlocklike>::Create( + std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd, + filter_policy)); + } + + return s; +} + +// For hash based index, return false if table_properties->prefix_extractor_name +// and prefix_extractor both exist and match, otherwise true. +inline bool PrefixExtractorChangedHelper( + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
+ // Turn off hash index in prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) { + return true; + } else { + return false; + } +} + +} // namespace + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.stats; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.stats; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, 
BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_add_redundant; + } + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_filter_add_redundant; + } + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + if (redundant) { + ++get_context->get_context_stats_ + .num_cache_compression_dict_add_redundant; + } + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_index_add_redundant; + } + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_data_add_redundant; + } + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT); + } + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +Cache::Handle* BlockBasedTable::GetEntryFromCache( + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, Cache::Priority priority) const { + Cache::Handle* cache_handle = nullptr; + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority, + wait, rep_->ioptions.statistics.get()); + } else { + cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get()); + } + + // Avoid updating metrics here if the handle is not 
complete yet. This + // happens with MultiGet and secondary cache. So update the metrics only + // if its a miss, or a hit and value is ready + if (!cache_handle || block_cache->Value(cache_handle)) { + if (cache_handle != nullptr) { + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); + } else { + UpdateCacheMissMetrics(block_type, get_context); + } + } + + return cache_handle; +} + +template <typename TBlocklike> +Status BlockBasedTable::InsertEntryToCache( + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr<TBlocklike>&& block_holder, size_t charge, + Cache::Handle** cache_handle, Cache::Priority priority) const { + Status s = Status::OK(); + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + s = block_cache->Insert(key, block_holder.get(), cache_helper, charge, + cache_handle, priority); + } else { + s = block_cache->Insert(key, block_holder.get(), charge, + cache_helper->del_cb, cache_handle, priority); + } + if (s.ok()) { + // Cache took ownership + block_holder.release(); + } + s.MustCheck(); + return s; +} + +namespace { +// Return True if table_properties has `user_prop_name` has a `true` value +// or it doesn't contain this property (for backward compatible). +bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older version doesn't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array<char, 200> msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file have global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array<char, 200> msg_buf; + // This is a v1 external sst file, global_seqno is not supported. + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. 
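+  // Resolution logic below: start from the stored global seqno (0 when the
+  // property is absent) and reconcile it with largest_seqno from the
+  // manifest. For example, a stored value of 0 with largest_seqno == 42
+  // resolves to 42, while a stored value of 7 with largest_seqno == 42 is
+  // reported as Corruption.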
+ SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno + // to denote it is unknown. + if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array<char, 200> msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast<unsigned long long>(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array<char, 200> msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast<unsigned long long>(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable) { + // Use a stable cache key if sufficient data is in table properties + std::string db_session_id; + uint64_t file_num; + std::string db_id; + if (properties && !properties->db_session_id.empty() && + properties->orig_file_number > 0) { + // (Newer SST file case) + // We must have both properties to get a stable unique id because + // CreateColumnFamilyWithImport or IngestExternalFiles can change the + // file numbers on a file. + db_session_id = properties->db_session_id; + file_num = properties->orig_file_number; + // Less critical, populated in earlier release than above + db_id = properties->db_id; + if (out_is_stable) { + *out_is_stable = true; + } + } else { + // (Old SST file case) + // We use (unique) cache keys based on current identifiers. These are at + // least stable across table file close and re-open, but not across + // different DBs nor DB close and re-open. + db_session_id = cur_db_session_id; + file_num = cur_file_number; + // Plumbing through the DB ID to here would be annoying, and of limited + // value because of the case of VersionSet::Recover opening some table + // files and later setting the DB ID. So we just rely on uniqueness + // level provided by session ID. + db_id = "unknown"; + if (out_is_stable) { + *out_is_stable = false; + } + } + + // Too many tests to update to get these working + // assert(file_num > 0); + // assert(!db_session_id.empty()); + // assert(!db_id.empty()); + + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offsets. See GetCacheKey. + *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num); +} + +CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle) { + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offet. 
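+  // Because two distinct block offsets in the same file differ by at least 5,
+  // their values shifted right by two bits still differ by at least 1, so no
+  // two blocks share a cache-key offset. For example, offsets 0, 5 and 10 map
+  // to 0, 1 and 2 respectively.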
+ return base_cache_key.WithOffset(handle.offset() >> 2); +} + +Status BlockBasedTable::Open( + const ReadOptions& read_options, const ImmutableOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr, + const std::shared_ptr<const SliceTransform>& prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, const bool force_direct_prefetch, + TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer, + size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, + uint64_t cur_file_num, UniqueId64x2 expected_unique_id) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + // From read_options, retain deadline, io_timeout, and rate_limiter_priority. + // In future, we may retain more + // options. Specifically, we ignore verify_checksums and default to + // checksum verification anyway when creating the index and filter + // readers. + ReadOptions ro; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + ro.rate_limiter_priority = read_options.rate_limiter_priority; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + if (!ioptions.allow_mmap_reads) { + s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, + tail_prefetch_stats, prefetch_all, preload_all, + &prefetch_buffer); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } + } else { + // Should not prefetch for mmap mode. + prefetch_buffer.reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, + true /* track_min_offset */)); + } + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + IOOptions opts; + s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, + &footer, kBlockBasedTableMagicNumber); + } + if (!s.ok()) { + return s; + } + if (!IsSupportedFormatVersion(footer.format_version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with newer " + "version of RocksDB?"); + } + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, + file_size, level, immortal_table); + rep->file = std::move(file); + rep->footer = footer; + + // For fully portable/stable cache keys, we need to read the properties + // block before setting up cache keys. TODO: consider setting up a bootstrap + // cache key for PersistentCache to use for metaindex and properties blocks. + rep->persistent_cache_options = PersistentCacheOptions(); + + // Meta-blocks are not dictionary compressed. 
Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + // Read metaindex + std::unique_ptr<BlockBasedTable> new_table( + new BlockBasedTable(rep, block_cache_tracer)); + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + + // Check expected unique id if provided + if (expected_unique_id != kNullUniqueId64x2) { + auto props = rep->table_properties; + if (!props) { + return Status::Corruption("Missing table properties on file " + + std::to_string(cur_file_num) + + " with known unique ID"); + } + UniqueId64x2 actual_unique_id{}; + s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &actual_unique_id, + /*force*/ true); + assert(s.ok()); // because force=true + if (expected_unique_id != actual_unique_id) { + return Status::Corruption( + "Mismatch in unique ID on table file " + + std::to_string(cur_file_num) + + ". Expected: " + InternalUniqueIdToHumanString(&expected_unique_id) + + " Actual: " + InternalUniqueIdToHumanString(&actual_unique_id)); + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::PassedVerifyUniqueId", + &actual_unique_id); + } else { + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::SkippedVerifyUniqueId", + nullptr); + if (ioptions.verify_sst_unique_id_in_manifest && ioptions.logger) { + // A crude but isolated way of reporting unverified files. This should not + // be an ongoing concern so doesn't deserve a place in Statistics IMHO. + static std::atomic<uint64_t> unverified_count{0}; + auto prev_count = + unverified_count.fetch_add(1, std::memory_order_relaxed); + if (prev_count == 0) { + ROCKS_LOG_WARN( + ioptions.logger, + "At least one SST file opened without unique ID to verify: %" PRIu64 + ".sst", + cur_file_num); + } else if (prev_count % 1000 == 0) { + ROCKS_LOG_WARN( + ioptions.logger, + "Another ~1000 SST files opened without unique ID to verify"); + } + } + } + + // Set up prefix extracto as needed + bool force_null_table_prefix_extractor = false; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTable::Open::ForceNullTablePrefixExtractor", + &force_null_table_prefix_extractor); + if (force_null_table_prefix_extractor) { + assert(!rep->table_prefix_extractor); + } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(), + prefix_extractor.get())) { + // Establish fast path for unchanged prefix_extractor + rep->table_prefix_extractor = prefix_extractor; + } else { + // Current prefix_extractor doesn't match table +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions + // will need to use it + ConfigOptions config_options; + Status st = SliceTransform::CreateFromString( + config_options, rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + if (!st.ok()) { + //**TODO: Should this be error be returned or swallowed? 
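+        // At present the failure is swallowed: it is only logged below and
+        // the table is opened without a table-level prefix extractor.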
+ ROCKS_LOG_ERROR(rep->ioptions.logger, + "Failed to create prefix extractor[%s]: %s", + rep->table_properties->prefix_extractor_name.c_str(), + st.ToString().c_str()); + } + } +#endif // ROCKSDB_LITE + } + + // With properties loaded, we can set up portable/stable cache keys + SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id, + cur_file_num, &rep->base_cache_key); + + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + rep->base_cache_key, rep->ioptions.stats); + + s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), + metaindex_iter.get(), internal_comparator, + &lookup_context); + if (!s.ok()) { + return s; + } + s = new_table->PrefetchIndexAndFilterBlocks( + ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, file_size, + max_file_size_for_l0_meta_pin, &lookup_context); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read()); + } + } + + if (s.ok() && table_reader_cache_res_mgr) { + std::size_t mem_usage = new_table->ApproximateMemoryUsage(); + s = table_reader_cache_res_mgr->MakeCacheReservation( + mem_usage, &(rep->table_reader_cache_res_handle)); + if (s.IsMemoryLimit()) { + s = Status::MemoryLimit( + "Can't allocate " + + kCacheEntryRoleToCamelString[static_cast<std::uint32_t>( + CacheEntryRole::kBlockBasedTableReader)] + + " due to memory limit based on " + "cache capacity for memory allocation"); + } + } + + if (s.ok()) { + *table_reader = std::move(new_table); + } + return s; +} + +Status BlockBasedTable::PrefetchTail( + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 
512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast<size_t>(file_size); + } else { + prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + + // Try file system prefetch + if (!file->use_direct_io() && !force_direct_prefetch) { + if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority) + .IsNotSupported()) { + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */)); + return Status::OK(); + } + } + + // Use `FilePrefetchBuffer` + prefetch_buffer->reset( + new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, + true /* enable */, true /* track_min_offset */)); + + IOOptions opts; + Status s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = (*prefetch_buffer) + ->Prefetch(opts, file, prefetch_off, prefetch_len, + ro.rate_limiter_priority); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const SequenceNumber largest_seqno) { + Status s; + BlockHandle handle; + s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (!handle.IsNull()) { + s = meta_iter->status(); + std::unique_ptr<TableProperties> table_properties; + if (s.ok()) { + s = ReadTablePropertiesHelper( + ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, nullptr /* memory_allocator */); + } + IGNORE_STATUS_IF_ERROR(s); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties = std::move(table_properties); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep_->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep_->ioptions.logger, + "Cannot find Properties block from file."); + } + + // Read the table properties, if provided. + if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. 
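+    // The index type, when recorded, is stored as a fixed-width 32-bit value
+    // in the user-collected properties and is decoded with DecodeFixed32
+    // below.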
+ auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + BlockHandle range_del_handle; + s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (!range_del_handle.IsNull()) { + Status tmp_status; + std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, prefetch_buffer, + /*for_compaction= */ false, /*async_read= */ false, tmp_status)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + IGNORE_STATUS_IF_ERROR(s); + } else { + rep_->fragmented_range_dels = + std::make_shared<FragmentedRangeTombstoneList>(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + size_t file_size, size_t max_file_size_for_l0_meta_pin, + BlockCacheLookupContext* lookup_context) { + // Find filter handle and filter type + if (rep_->filter_policy) { + auto name = rep_->filter_policy->CompatibilityName(); + bool builtin_compatible = + strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0; + + for (const auto& [filter_type, prefix] : + {std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix), + std::make_pair(Rep::FilterType::kPartitionedFilter, + kPartitionedFilterBlockPrefix), + std::make_pair(Rep::FilterType::kNoFilter, + kObsoleteFilterBlockPrefix)}) { + if (builtin_compatible) { + // This code is only here to deal with a hiccup in early 7.0.x where + // there was an unintentional name change in the SST files metadata. + // It should be OK to remove this in the future (late 2022) and just + // have the 'else' code. 
+ // NOTE: the test:: names below are likely not needed but included + // out of caution + static const std::unordered_set<std::string> kBuiltinNameAndAliases = { + BuiltinFilterPolicy::kCompatibilityName(), + test::LegacyBloomFilterPolicy::kClassName(), + test::FastLocalBloomFilterPolicy::kClassName(), + test::Standard128RibbonFilterPolicy::kClassName(), + "rocksdb.internal.DeprecatedBlockBasedBloomFilter", + BloomFilterPolicy::kClassName(), + RibbonFilterPolicy::kClassName(), + }; + + // For efficiency, do a prefix seek and see if the first match is + // good. + meta_iter->Seek(prefix); + if (meta_iter->status().ok() && meta_iter->Valid()) { + Slice key = meta_iter->key(); + if (key.starts_with(prefix)) { + key.remove_prefix(prefix.size()); + if (kBuiltinNameAndAliases.find(key.ToString()) != + kBuiltinNameAndAliases.end()) { + Slice v = meta_iter->value(); + Status s = rep_->filter_handle.DecodeFrom(&v); + if (s.ok()) { + rep_->filter_type = filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Detected obsolete filter type in %s. Read " + "performance might suffer until DB is fully " + "re-compacted.", + rep_->file->file_name().c_str()); + } + break; + } + } + } + } + } else { + std::string filter_block_key = prefix + name; + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + ROCKS_LOG_WARN( + rep_->ioptions.logger, + "Detected obsolete filter type in %s. Read performance might " + "suffer until DB is fully re-compacted.", + rep_->file->file_name().c_str()); + } + break; + } + } + } + } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + // Find compression dictionary handle + Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; + } + + BlockBasedTableOptions::IndexType index_type = rep_->index_type; + + const bool use_cache = table_options.cache_index_and_filter_blocks; + + const bool maybe_flushed = + level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function<bool(PinningTier, PinningTier)> is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. + assert(false); + return false; + }; + const bool pin_top_level_index = is_pinned( + table_options.metadata_cache_options.top_level_index_pinning, + table_options.pin_top_level_index_and_filter ? PinningTier::kAll + : PinningTier::kNone); + const bool pin_partition = + is_pinned(table_options.metadata_cache_options.partition_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? 
PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + const bool pin_unpartitioned = + is_pinned(table_options.metadata_cache_options.unpartitioned_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + + // pin the first level of index + const bool pin_index = + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of index + // WART: this might be redundant (unnecessary cache hit) if !pin_index, + // depending on prepopulate_block_cache option + const bool prefetch_index = prefetch_all || pin_index; + + std::unique_ptr<IndexReader> index_reader; + s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of partitioned index are always stored in cache. They + // are hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all || pin_partition) { + s = rep_->index_reader->CacheDependencies(ro, pin_partition); + } + if (!s.ok()) { + return s; + } + + // pin the first level of filter + const bool pin_filter = + rep_->filter_type == Rep::FilterType::kPartitionedFilter + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of filter + // WART: this might be redundant (unnecessary cache hit) if !pin_filter, + // depending on prepopulate_block_cache option + const bool prefetch_filter = prefetch_all || pin_filter; + + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + + if (filter) { + // Refer to the comment above about paritioned indexes always being cached + if (prefetch_all || pin_partition) { + s = filter->CacheDependencies(ro, pin_partition); + if (!s.ok()) { + return s; + } + } + rep_->filter = std::move(filter); + } + } + + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr<UncompressionDictReader> uncompression_dict_reader; + s = UncompressionDictReader::Create( + this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned, + pin_unpartitioned, lookup_context, &uncompression_dict_reader); + if (!s.ok()) { + return s; + } + + rep_->uncompression_dict_reader = std::move(uncompression_dict_reader); + } + + assert(s.ok()); + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->ioptions.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->file()->Hint(FSRandomAccessFile::kNormal); + break; + case Options::SEQUENTIAL: + rep_->file->file()->Hint(FSRandomAccessFile::kSequential); + break; + case Options::WILLNEED: + rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed); + break; + default: + assert(false); + } +} + +std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties() + const { + return rep_->table_properties; +} + +size_t BlockBasedTable::ApproximateMemoryUsage() const { + size_t usage = 0; + if (rep_) { + usage += rep_->ApproximateMemoryUsage(); + } else { + return usage; + } + if (rep_->filter) { + usage += rep_->filter->ApproximateMemoryUsage(); + } + if (rep_->index_reader) { + usage += rep_->index_reader->ApproximateMemoryUsage(); + } + if (rep_->uncompression_dict_reader) { + usage += 
rep_->uncompression_dict_reader->ApproximateMemoryUsage(); + } + if (rep_->table_properties) { + usage += rep_->table_properties->ApproximateMemoryUsage(); + } + return usage; +} + +// Load the meta-index-block from the file. On success, return the loaded +// metaindex +// block and its iterator. +Status BlockBasedTable::ReadMetaIndexBlock( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* metaindex_block, + std::unique_ptr<InternalIterator>* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr<Block> metaindex; + Status s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, + rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, + 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options), + false /* for_compaction */, rep_->blocks_definitely_zstd_compressed, + nullptr /* filter_policy */, false /* async_read */); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.logger, + "Encountered error while reading data from properties" + " block %s", + s.ToString().c_str()); + return s; + } + + *metaindex_block = std::move(metaindex); + // meta block uses bytewise comparator. + iter->reset(metaindex_block->get()->NewMetaIterator()); + return Status::OK(); +} + +template <typename TBlocklike> +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, + CachableEntry<TBlocklike>* out_parsed_block, + const UncompressionDict& uncompression_dict, BlockType block_type, + const bool wait, GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); + // Here we treat the legacy name "...index_and_filter_blocks..." to mean all + // metadata blocks that might go into block cache, EXCEPT only those needed + // for the read path (Get, etc.). TableProperties should not be needed on the + // read path (prefix extractor setting is an O(1) size special case that we + // are working not to require from TableProperties), so it is not given + // high-priority treatment if it should go into BlockCache. + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + block_type != BlockType::kData && + block_type != BlockType::kProperties + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; + + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep_->ioptions.statistics.get(); + bool using_zstd = rep_->blocks_definitely_zstd_compressed; + const FilterPolicy* filter_policy = rep_->filter_policy; + Cache::CreateCallback create_cb = GetCreateCallback<TBlocklike>( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + assert(!cache_key.empty()); + Cache::Handle* cache_handle = nullptr; + cache_handle = GetEntryFromCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + block_type, wait, get_context, + BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), create_cb, + priority); + if (cache_handle != nullptr) { + out_parsed_block->SetCachedValue( + reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)), + block_cache, cache_handle); + return s; + } + } + + // If not found, search from the compressed block cache. + assert(out_parsed_block->IsEmpty()); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!cache_key.empty()); + BlockContents contents; + block_cache_compressed_handle = + block_cache_compressed->Lookup(cache_key, statistics); + + // if we found in the compressed cache, then uncompress and insert into + // uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast<BlockContents*>( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = GetBlockCompressionType(*compressed_block); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressSerializedBlock( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + + // Insert parsed block into block cache, the priority is based on the + // data block type. 
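+  // On success the parsed (uncompressed) block is also inserted into the
+  // regular block cache when one is configured, the block owns its bytes and
+  // fill_cache is set; otherwise it is handed back as an owned value.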
+ if (s.ok()) { + std::unique_ptr<TBlocklike> block_holder( + BlocklikeTraits<TBlocklike>::Create( + std::move(contents), read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + + if (block_cache != nullptr && block_holder->own_bytes() && + read_options.fill_cache) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + auto block_holder_raw_ptr = block_holder.get(); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), + std::move(block_holder), charge, &cache_handle, priority); + if (s.ok()) { + assert(cache_handle != nullptr); + out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + out_parsed_block->SetOwnedValue(std::move(block_holder)); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +template <typename TBlocklike> +Status BlockBasedTable::PutDataBlockToCache( + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, + CachableEntry<TBlocklike>* out_parsed_block, BlockContents&& block_contents, + CompressionType block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const { + const ImmutableOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + block_type != BlockType::kData + ? Cache::Priority::HIGH + : Cache::Priority::LOW; + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.stats; + + std::unique_ptr<TBlocklike> block_holder; + if (block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(block_comp_type); + UncompressionInfo info(context, uncompression_dict, block_comp_type); + s = UncompressBlockData(info, block_contents.data.data(), + block_contents.data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(uncompressed_block_contents), read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } else { + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(block_contents), read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } + + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. 
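+  // Note the split below: the serialized (still compressed) bytes go into the
+  // compressed block cache here, while the parsed block created above goes
+  // into the regular block cache further down.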
+ if (block_cache_compressed != nullptr && block_comp_type != kNoCompression && + block_contents.own_bytes()) { + assert(block_contents.has_trailer); + assert(!cache_key.empty()); + + // We cannot directly put block_contents because this could point to + // an object in the stack. + auto block_cont_for_comp_cache = + std::make_unique<BlockContents>(std::move(block_contents)); + size_t charge = block_cont_for_comp_cache->ApproximateMemoryUsage(); + + s = block_cache_compressed->Insert( + cache_key, block_cont_for_comp_cache.get(), charge, + &DeleteCacheEntry<BlockContents>, nullptr /*handle*/, + Cache::Priority::LOW); + + if (s.ok()) { + // Cache took ownership + block_cont_for_comp_cache.release(); + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } + } + + // insert into uncompressed block cache + if (block_cache != nullptr && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + auto block_holder_raw_ptr = block_holder.get(); + Cache::Handle* cache_handle = nullptr; + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), + std::move(block_holder), charge, &cache_handle, priority); + if (s.ok()) { + assert(cache_handle != nullptr); + out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + out_parsed_block->SetOwnedValue(std::move(block_holder)); + } + + return s; +} + +std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) { + auto& rep = rep_; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr<FilterBlockReader>(); + } + + assert(rep->filter_policy); + + switch (filter_type) { + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); + + default: + // filter_type is either kNoFilter (exited the function at the first if), + // or it must be covered in this switch block + assert(false); + return std::unique_ptr<FilterBlockReader>(); + } +} + +// disable_prefix_seek should be set to true when prefix_extractor found in SST +// differs from the one in mutable_cf_options and index type is HashBasedIndex +InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context, + lookup_context); +} + +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>( + const Rep* rep, Block* block, BlockType block_type, + DataBlockIter* input_iter, bool block_contents_pinned) { + return block->NewDataIterator(rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, + rep->ioptions.stats, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>( + const Rep* rep, Block* block, BlockType block_type, + IndexBlockIter* input_iter, bool block_contents_pinned) { + return block->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, + /* total_order_seek */ true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full, + block_contents_pinned); +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template <typename TBlocklike> +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, + CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read) const { + assert(out_parsed_block != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep_->table_options.block_cache.get(); + Cache* block_cache_compressed = + rep_->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. + Status s; + CacheKey key_data; + Slice key; + bool is_cache_hit = false; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + key_data = GetCacheKey(rep_->base_cache_key, handle); + key = key_data.AsSlice(); + + if (!contents) { + s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro, + out_parsed_block, uncompression_dict, + block_type, wait, get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern( + handle.offset(), BlockSizeWithTrailer(handle), + ro.adaptive_readahead /*decrease_readahead_size*/); + } + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. 
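+    // This path only reads the block from the file when fill_cache is set;
+    // otherwise the caller (e.g. RetrieveBlock) performs the read itself
+    // without inserting into the cache.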
+ if (out_parsed_block->GetValue() == nullptr && + out_parsed_block->GetCacheHandle() == nullptr && !no_io && + ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; + CompressionType contents_comp_type; + // Maybe serialized or uncompressed + BlockContents tmp_contents; + if (!contents) { + Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS + : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, statistics, histogram); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed, + block_type, uncompression_dict, rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + + // If prefetch_buffer is not allocated, it will fallback to synchronous + // reading of block contents. + if (async_read && prefetch_buffer != nullptr) { + s = block_fetcher.ReadAsyncBlockContents(); + if (!s.ok()) { + return s; + } + } else { + s = block_fetcher.ReadBlockContents(); + } + + contents_comp_type = block_fetcher.get_compression_type(); + contents = &tmp_contents; + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++get_context->get_context_stats_.num_index_read; + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + ++get_context->get_context_stats_.num_filter_read; + break; + default: + break; + } + } + } else { + contents_comp_type = GetBlockCompressionType(*contents); + } + + if (s.ok()) { + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, block_cache, block_cache_compressed, out_parsed_block, + std::move(*contents), contents_comp_type, uncompression_dict, + GetMemoryAllocator(rep_->table_options), block_type, get_context); + } + } + } + + // Fill lookup_context. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (out_parsed_block->GetValue()) { + // Approximate the number of keys in the block using restarts. + nkeys = rep_->table_options.block_restart_interval * + BlocklikeTraits<TBlocklike>::GetNumRestarts( + *out_parsed_block->GetValue()); + usage = out_parsed_block->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key_exist_in_block. 
+ + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copy of block_key and cf_name when constructing the access + // record. + BlockCacheTraceRecord access_record( + rep_->ioptions.clock->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); + // TODO: Should handle this error? + block_cache_tracer_ + ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), + lookup_context->referenced_key) + .PermitUncheckedError(); + } + } + + assert(s.ok() || out_parsed_block->GetValue() == nullptr); + return s; +} + +template <typename TBlocklike> +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache, bool wait_for_cache, + bool async_read) const { + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, + uncompression_dict, wait_for_cache, + for_compaction, out_parsed_block, + block_type, get_context, lookup_context, + /*contents=*/nullptr, async_read); + + if (!s.ok()) { + return s; + } + + if (out_parsed_block->GetValue() != nullptr || + out_parsed_block->GetCacheHandle() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(out_parsed_block->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr<TBlocklike> block; + + { + Histograms histogram = + for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); + s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0, + GetMemoryAllocator(rep_->table_options), for_compaction, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get(), async_read); + + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++(get_context->get_context_stats_.num_index_read); + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + ++(get_context->get_context_stats_.num_filter_read); + break; + default: + break; + } + } + } + + if (!s.ok()) { + return s; + } + + out_parsed_block->SetOwnedValue(std::move(block)); + + assert(s.ok()); + return s; +} + +// Explicitly instantiate templates for each "blocklike" type we use. +// This makes it possible to keep the template definitions in the .cc file. 
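+// A new TBlocklike type passed to RetrieveBlock therefore also needs a
+// matching explicit instantiation added here, or the call will fail to link
+// with an undefined-symbol error.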
+template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<ParsedFullFilterBlock>* out_parsed_block, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, bool for_compaction, + bool use_cache, bool wait_for_cache, bool async_read) const; + +template Status BlockBasedTable::RetrieveBlock<Block>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* out_parsed_block, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache, bool wait_for_cache, + bool async_read) const; + +template Status BlockBasedTable::RetrieveBlock<UncompressionDict>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<UncompressionDict>* out_parsed_block, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache, bool wait_for_cache, + bool async_read) const; + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + UnorderedMap<uint64_t, CachableEntry<Block>>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase<IndexValue>* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // block_map_ must be exhaustive + if (block == block_map_->end()) { + assert(false); + // Signal problem to caller + return nullptr; + } + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and +// will return true if the filter block is not in memory and not found in block +// cache. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. 
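+// As an illustration of the three properties above: with a fixed-length
+// prefix extractor of length 4 and the bytewise comparator,
+// prefix("abcdxyz") == "abcd", the key starts with it, it compares <= the
+// key, and taking the first four bytes preserves the ordering of the keys.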
+bool BlockBasedTable::PrefixRangeMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { + return true; + } + + bool may_match = true; + + FilterBlockReader* const filter = rep_->filter.get(); + bool filter_checked = false; + if (filter != nullptr) { + const bool no_io = read_options.read_tier == kBlockCacheTier; + + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check, no_io, lookup_context, + read_options.rate_limiter_priority); + } + + if (filter_checked) { + Statistics* statistics = rep_->ioptions.stats; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + } + + return may_match; +} + +bool BlockBasedTable::PrefixExtractorChanged( + const SliceTransform* prefix_extractor) const { + if (prefix_extractor == nullptr) { + return true; + } else if (prefix_extractor == rep_->table_prefix_extractor.get()) { + return false; + } else { + return PrefixExtractorChangedHelper(rep_->table_properties.get(), + prefix_extractor); + } +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size, bool allow_unprepared_value) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); + std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator( + read_options, + /*disable_prefix_seek=*/need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); + if (arena == nullptr) { + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); + } else { + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = 
kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels, + rep_->internal_comparator, + snapshot, read_options.timestamp); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + FilterBlockReader* filter, const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const { + if (filter == nullptr) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + if (rep_->whole_key_filtering) { + may_match = + filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr, + get_context, lookup_context, rate_limiter_priority); + } else if (!PrefixExtractorChanged(prefix_extractor) && + prefix_extractor->InDomain(user_key_without_ts) && + !filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), no_io, + const_ikey_ptr, get_context, lookup_context, + rate_limiter_priority)) { + // FIXME ^^^: there should be no reason for Get() to depend on current + // prefix_extractor at all. It should always use table_prefix_extractor. + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +void BlockBasedTable::FullFilterKeysMayMatch( + FilterBlockReader* filter, MultiGetRange* range, const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const { + if (filter == nullptr) { + return; + } + uint64_t before_keys = range->KeysLeft(); + assert(before_keys > 0); // Caller should ensure + if (rep_->whole_key_filtering) { + filter->KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority); + uint64_t after_keys = range->KeysLeft(); + if (after_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, + rep_->level); + } + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, + rep_->level); + } + } else if (!PrefixExtractorChanged(prefix_extractor)) { + // FIXME ^^^: there should be no reason for MultiGet() to depend on current + // prefix_extractor at all. It should always use table_prefix_extractor. + filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context, + rate_limiter_priority); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); + uint64_t after_keys = range->KeysLeft(); + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, + filtered_keys); + } + } +} + +Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options, + std::vector<Anchor>& anchors) { + // We iterator the whole index block here. More efficient implementation + // is possible if we push this operation into IndexReader. 
For example, we + // can directly sample from restart block entries in the index block and + // only read keys needed. Here we take a simple solution. Performance is + // likely not to be a problem. We are compacting the whole file, so all + // keys will be read out anyway. An extra read to index block might be + // a small share of the overhead. We can try to optimize if needed. + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, /*lookup_context=*/nullptr); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + // If needed the threshold could be more adaptive. For example, it can be + // based on size, so that a larger will be sampled to more partitions than a + // smaller file. The size might also need to be passed in by the caller based + // on total compaction size. + const uint64_t kMaxNumAnchors = uint64_t{128}; + uint64_t num_blocks = this->GetTableProperties()->num_data_blocks; + uint64_t num_blocks_per_anchor = num_blocks / kMaxNumAnchors; + if (num_blocks_per_anchor == 0) { + num_blocks_per_anchor = 1; + } + + uint64_t count = 0; + std::string last_key; + uint64_t range_size = 0; + uint64_t prev_offset = 0; + for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { + const BlockHandle& bh = iiter->value().handle; + range_size += bh.offset() + bh.size() - prev_offset; + prev_offset = bh.offset() + bh.size(); + if (++count % num_blocks_per_anchor == 0) { + count = 0; + anchors.emplace_back(iiter->user_key(), range_size); + range_size = 0; + } else { + last_key = iiter->user_key().ToString(); + } + } + if (count != 0) { + anchors.emplace_back(last_key, range_size); + } + return Status::OK(); +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If full filter not useful, Then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch"); + const bool may_match = FullFilterKeyMayMatch( + filter, key, no_io, prefix_extractor, get_context, &lookup_context, + read_options.rate_limiter_priority); + TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); + if (!may_match) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + bool matched = false; // if such user key matched a key in SST + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + Status tmp_status; + NewDataBlockIterator<DataBlockIter>( + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, /*prefetch_buffer=*/nullptr, + /*for_compaction=*/false, /*async_read=*/false, tmp_status); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + s = biter.status(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + Status pik_status = ParseInternalKey( + biter.key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } + done = true; + break; + } + } + s = biter.status(); + } + // Write the block cache access record. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. 
+ Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.clock->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + // TODO: Should handle status here? + block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + } + + return s; +} + +Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options, + const SliceTransform* prefix_extractor, + MultiGetRange* mget_range) { + if (mget_range->empty()) { + // Caller should ensure non-empty (performance bug) + assert(false); + return Status::OK(); // Nothing to do + } + + FilterBlockReader* const filter = rep_->filter.get(); + if (!filter) { + return Status::OK(); + } + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (mget_range->begin()->get_context) { + tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor, + &lookup_context, read_options.rate_limiter_priority); + + return Status::OK(); +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? 
iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + Status tmp_status; + NewDataBlockIterator<DataBlockIter>( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + ReadOptions ro; + s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase<IndexValue>* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase<IndexValue>* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : rep_->table_options.max_auto_readahead_size; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. 
+ FilePrefetchBuffer prefetch_buffer( + readahead_size /* readahead_size */, + readahead_size /* max_readahead_size */, + !rep_->ioptions.allow_mmap_reads /* enable */); + + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle = index_iter->value().handle; + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle, + &contents, rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kData, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + // In the case of two level indexes, we would have exited the above loop + // by checking index_iter->Valid(), but Valid() might have returned false + // due to an IO error. So check the index_iter status + s = index_iter->status(); + } + return s; +} + +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFullFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilterPartitionIndex; + } + + if (meta_block_name == kPropertiesBlockName) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlockName) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlockName) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) { + // Obsolete but possible in old files + return BlockType::kInvalid; + } + + assert(false); + return BlockType::kInvalid; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + InternalIteratorBase<Slice>* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + BlockContents contents; + const Slice meta_block_name = index_iter->key(); + if (meta_block_name == kPropertiesBlockName) { + // Unfortunate special handling for properties block checksum w/ + // global seqno + std::unique_ptr<TableProperties> table_properties; + s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + nullptr /* prefetch_buffer */, rep_->footer, + rep_->ioptions, &table_properties, + nullptr /* memory_allocator */); + } else { + s = BlockFetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) + .ReadBlockContents(); + } + if (!s.ok()) { + break; + } + } + return s; +} + +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + CacheKey key = GetCacheKey(rep_->base_cache_key, handle); + + Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice()); + if (cache_handle == 
nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator( + options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + iiter->Seek(key); + assert(iiter->Valid()); + + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + FALLTHROUGH_INTENDED; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + if (!rep_->table_prefix_extractor) { + ROCKS_LOG_WARN(rep_->ioptions.logger, + "Missing prefix extractor for hash index. Fall back to" + " binary search index."); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } else { + return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + } + default: { + std::string error_message = + "Unrecognized index type: " + std::to_string(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateDataOffsetOf( + const InternalIteratorBase<IndexValue>& index_iter, + uint64_t data_size) const { + assert(index_iter.status().ok()); + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + return handle.offset(); + } else { + // The iterator is past the last key in the file. + return data_size; + } +} + +uint64_t BlockBasedTable::GetApproximateDataSize() { + // Should be in table properties unless super old version + if (rep_->table_properties) { + return rep_->table_properties->data_size; + } + // Fall back to rough estimate from footer + return rep_->footer.metaindex_handle().offset(); +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Let's just split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. 
+ return rep_->file_size / 2; + } + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + uint64_t offset; + if (index_iter->status().ok()) { + offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. + double size_ratio = + static_cast<double>(offset) / static_cast<double>(data_size); + return static_cast<uint64_t>(size_ratio * + static_cast<double>(rep_->file_size)); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Assume whole file is involved, since we have lower and upper + // bound. This likely skews the estimate if we consider that this function + // is typically called with `[start, end]` fully contained in the file's + // key-range. + return rep_->file_size; + } + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset; + if (index_iter->status().ok()) { + start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved from the start. This likely skews the estimate + // but is consistent with the above error handling. + start_offset = 0; + } + + index_iter->Seek(end); + uint64_t end_offset; + if (index_iter->status().ok()) { + end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved until the end. This likely skews the estimate + // but is consistent with the above error handling. + end_offset = data_size; + } + + assert(end_offset >= start_offset); + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. 
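+  // For example, if start_offset = 10 MB, end_offset = 30 MB, data_size = 80
+  // MB and file_size = 100 MB, the estimate is (30 - 10) / 80 * 100 = 25 MB.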
+ double size_ratio = static_cast<double>(end_offset - start_offset) / + static_cast<double>(data_size); + return static_cast<uint64_t>(size_ratio * + static_cast<double>(rep_->file_size)); +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return rep_->filter_type != Rep::FilterType::kNoFilter && + TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector<KVPairBlock>* kv_pair_blocks) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr<InternalIterator> datablock_iter; + Status tmp_status; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + WritableFileStringStreamAdapter out_file_wrapper(out_file); + std::ostream out_stream(&out_file_wrapper); + // Output Footer + out_stream << "Footer Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->footer.ToString() << "\n"; + + // Output MetaIndex + out_stream << "Metaindex Details:\n" + "--------------------------------------\n"; + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + ReadOptions ro; + Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == kPropertiesBlockName) { + out_stream << " Properties block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kCompressionDictBlockName) { + out_stream << " Compression dictionary block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_stream << " Filter block handle: " + << 
metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kRangeDelBlockName) { + out_stream << " Range deletion block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } + } + out_stream << "\n"; + } else { + return s; + } + + // Output TableProperties + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_stream << "Table Properties:\n" + "--------------------------------------\n"; + out_stream << " " << table_properties->ToString("\n ", ": ") << "\n"; + } + + if (rep_->filter) { + out_stream << "Filter Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->filter->ToString() << "\n"; + } + + // Output Index block + s = DumpIndexBlock(out_stream); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry<UncompressionDict> uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + false, /* verify_checksums */ + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_stream << "Compression Dictionary:\n" + "--------------------------------------\n"; + out_stream << " size (bytes): " << raw_dict.size() << "\n\n"; + out_stream << " HEX " << raw_dict.ToString(true) << "\n\n"; + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_stream << "Range deletions:\n" + "--------------------------------------\n"; + for (; range_del_iter->Valid(); range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), + out_stream); + } + out_stream << "\n"; + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_stream); + + if (!s.ok()) { + return s; + } + + if (!out_stream.good()) { + return Status::IOError("Failed to write to output file"); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { + out_stream << "Index Details:\n" + "--------------------------------------\n"; + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_stream << "Can not read Index Block \n\n"; + return s; + } + + out_stream << " Block key hex dump: Data block handle\n"; + out_stream << " Block key ascii\n\n"; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_stream << " HEX " << user_key.ToString(true) << ": " + << blockhandles_iter->value().ToString(true, + rep_->index_has_first_key) + << " offset " << blockhandles_iter->value().handle.offset() + << " size " << blockhandles_iter->value().handle.size() << "\n"; + + 
std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_stream << " ASCII " << res_key << "\n"; + out_stream << " ------\n"; + } + out_stream << "\n"; + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_stream << "Can not read Index Block \n\n"; + return s; + } + + uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_stream << "Data Block # " << block_id << " @ " + << blockhandles_iter->value().handle.ToString(true) << "\n"; + out_stream << "--------------------------------------\n"; + + std::unique_ptr<InternalIterator> datablock_iter; + Status tmp_status; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, + /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, + /*async_read=*/false, tmp_status)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_stream << "Error reading the block - Skipped \n\n"; + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_stream << "Error reading the block - Skipped \n"; + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream); + } + out_stream << "\n"; + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg = + static_cast<double>(datablock_size_sum) / num_datablocks; + out_stream << "Data Block Summary:\n"; + out_stream << "--------------------------------------\n"; + out_stream << " # data blocks: " << num_datablocks << "\n"; + out_stream << " min data block size: " << datablock_size_min << "\n"; + out_stream << " max data block size: " << datablock_size_max << "\n"; + out_stream << " avg data block size: " + << std::to_string(datablock_size_avg) << "\n"; + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + std::ostream& out_stream) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_stream << " HEX " << ikey.user_key().ToString(true) << ": " + << value.ToString(true) << "\n"; + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, 
cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_stream << " ASCII " << res_key << ": " << res_value << "\n"; + out_stream << " ------\n"; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.h b/src/rocksdb/table/block_based/block_based_table_reader.h new file mode 100644 index 000000000..89de891c9 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.h @@ -0,0 +1,739 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cstdint> +#include <memory> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "cache/cache_reservation_manager.h" +#include "db/range_tombstone_fragmenter.h" +#include "file/filename.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/uncompression_dict_reader.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" +#include "table/table_properties_internal.h" +#include "table/table_reader.h" +#include "table/two_level_iterator.h" +#include "trace_replay/block_cache_tracer.h" +#include "util/coro_utils.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +class FilterBlockReader; +class FullFilterBlockReader; +class Footer; +class InternalKeyComparator; +class Iterator; +class FSRandomAccessFile; +class TableCache; +class TableReader; +class WritableFile; +struct BlockBasedTableOptions; +struct EnvOptions; +struct ReadOptions; +class GetContext; + +using KVPairBlock = std::vector<std::pair<std::string, std::string>>; + +// Reader class for BlockBasedTable format. +// For the format of BlockBasedTable refer to +// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format. +// This is the default table type. Data is chucked into fixed size blocks and +// each block in-turn stores entries. When storing data, we can compress and/or +// encode data efficiently within a block, which often results in a much smaller +// data size compared with the raw data size. As for the record retrieval, we'll +// first locate the block where target record may reside, then read the block to +// memory, and finally search that record within the block. Of course, to avoid +// frequent reads of the same block, we introduced the block cache to keep the +// loaded blocks in the memory. 
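For readers coming from the public API side, the format and the block cache described above are configured through BlockBasedTableOptions. A minimal, commonly used setup looks roughly like this (`MakeBlockBasedOptions` is an invented helper and the sizes are arbitrary examples):

#include <rocksdb/cache.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

rocksdb::Options MakeBlockBasedOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_size = 16 * 1024;  // target uncompressed data block size
  table_options.block_cache =
      rocksdb::NewLRUCache(256 << 20);  // keep hot blocks in memory
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10.0));  // roughly 10 bits per key

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
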
+class BlockBasedTable : public TableReader { + public: + static const std::string kObsoleteFilterBlockPrefix; + static const std::string kFullFilterBlockPrefix; + static const std::string kPartitionedFilterBlockPrefix; + + // 1-byte compression type + 32-bit checksum + static constexpr size_t kBlockTrailerSize = 5; + + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table_reader" to the newly opened + // table. The client should delete "*table_reader" when no longer needed. + // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // @param file must remain live while this Table is in use. + // @param prefetch_index_and_filter_in_cache can be used to disable + // prefetching of + // index and filter blocks into block cache at startup + // @param skip_filters Disables loading/accessing the filter block. Overrides + // prefetch_index_and_filter_in_cache, so filter will be skipped if both + // are set. + // @param force_direct_prefetch if true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). + static Status Open( + const ReadOptions& ro, const ImmutableOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr = + nullptr, + const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, + int level = -1, const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + bool force_direct_prefetch = false, + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr, + size_t max_file_size_for_l0_meta_pin = 0, + const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0, + UniqueId64x2 expected_unique_id = {}); + + bool PrefixRangeMayMatch(const Slice& internal_key, + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // @param read_options Must outlive the returned iterator. + // @param skip_filters Disables loading/accessing the filter block + // compaction_readahead_size: its value will only be used if caller = + // kCompaction. 
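The "initially invalid, Seek before use" contract noted above is the same contract users see on DB-level iterators; as a short usage reminder against the public API rather than this internal entry point (`ScanAll` is an invented helper):

#include <cassert>
#include <memory>
#include <rocksdb/db.h>

void ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  // Not positioned yet: a Seek*() call must come before key()/value().
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // use it->key() and it->value() here
  }
  assert(it->status().ok());  // surface any iteration error at the end
}
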
+ InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& read_options) override; + + // @param skip_filters Disables loading/accessing the filter block + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + Status MultiGetFilter(const ReadOptions& read_options, + const SliceTransform* prefix_extractor, + MultiGetRange* mget_range) override; + + DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet, + const ReadOptions& readOptions, + const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters = false); + + // Pre-fetch the disk blocks that correspond to the key range specified by + // (kbegin, kend). The call will return error status in the event of + // IO or iteration error. + Status Prefetch(const Slice* begin, const Slice* end) override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + + Status ApproximateKeyAnchors(const ReadOptions& read_options, + std::vector<Anchor>& anchors) override; + + bool TEST_BlockInCache(const BlockHandle& handle) const; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table && block cache enabled + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + void SetupForCompaction() override; + + std::shared_ptr<const TableProperties> GetTableProperties() const override; + + size_t ApproximateMemoryUsage() const override; + + // convert SST file to a human readable form + Status DumpTable(WritableFile* out_file) override; + + Status VerifyChecksum(const ReadOptions& readOptions, + TableReaderCaller caller) override; + + ~BlockBasedTable(); + + bool TEST_FilterBlockInCache() const; + bool TEST_IndexBlockInCache() const; + + // IndexReader is the interface that provides the functionality for index + // access. + class IndexReader { + public: + virtual ~IndexReader() = default; + + // Create an iterator for index access. If iter is null, then a new object + // is created on the heap, and the callee will have the ownership. + // If a non-null iter is passed in, it will be used, and the returned value + // is either the same as iter or a new on-heap object that + // wraps the passed iter. 
In the latter case the return value points + // to a different object then iter, and the callee has the ownership of the + // returned object. + virtual InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + + // Report an approximation of how much memory has been used other than + // memory that was allocated in block cache. + virtual size_t ApproximateMemoryUsage() const = 0; + // Cache the dependencies of the index reader (e.g. the partitions + // of a partitioned index). + virtual Status CacheDependencies(const ReadOptions& /*ro*/, + bool /* pin */) { + return Status::OK(); + } + }; + + class IndexReaderCommon; + + static void SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable = nullptr); + + static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle); + + static void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage, + bool redundant, + Statistics* const statistics); + + // Get the size to read from storage for a BlockHandle. size_t because we + // are about to load into memory. + static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) { + return static_cast<size_t>(handle.size() + kBlockTrailerSize); + } + + // It is the caller's responsibility to make sure that this is called with + // block-based table serialized block contents, which contains the compression + // byte in the trailer after `block_size`. + static inline CompressionType GetBlockCompressionType(const char* block_data, + size_t block_size) { + return static_cast<CompressionType>(block_data[block_size]); + } + static inline CompressionType GetBlockCompressionType( + const BlockContents& contents) { + assert(contents.has_trailer); + return GetBlockCompressionType(contents.data.data(), contents.data.size()); + } + + // Retrieve all key value pairs from data blocks in the table. + // The key retrieved are internal keys. 
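As a reminder of what "internal keys" means here: each internal key is the user key followed by an 8-byte trailer that packs the sequence number and value type. A sketch of stripping that trailer, mirroring what ExtractUserKey does (`UserKeyOf` is an invented name; no error handling):

#include <cassert>
#include <rocksdb/slice.h>

// internal key = | user key (N bytes) | packed sequence number + type (8 bytes) |
rocksdb::Slice UserKeyOf(const rocksdb::Slice& internal_key) {
  assert(internal_key.size() >= 8);
  return rocksdb::Slice(internal_key.data(), internal_key.size() - 8);
}
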
+ Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); + + struct Rep; + + Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, + bool for_compaction, bool async_read, + Status& s) const; + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, Status s) const; + + class PartitionedIndexIteratorState; + + template <typename TBlocklike> + friend class FilterBlockReaderCommon; + + friend class PartitionIndexReader; + + friend class UncompressionDictReader; + + protected: + Rep* rep_; + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; + + private: + friend class MockedBlockBasedTable; + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + + Cache::Handle* GetEntryFromCache(const CacheTier& cache_tier, + Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, + GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority) const; + + template <typename TBlocklike> + Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache, + const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr<TBlocklike>&& block_holder, + size_t charge, Cache::Handle** cache_handle, + Cache::Priority priority) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template <typename TBlockIter> + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + BlockType block_type, + TBlockIter* input_iter, + bool block_contents_pinned); + + // If block cache enabled (compressed or uncompressed), looks for the block + // identified by handle in (1) uncompressed cache, (2) compressed cache, and + // then (3) file. If found, inserts into the cache(s) that were searched + // unsuccessfully (e.g., if found in file, will add to both uncompressed and + // compressed caches if they're enabled). + // + // @param block_entry value is set to the uncompressed block if found. If + // in uncompressed block cache, also sets cache_handle to reference that + // block. 
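The lookup order described above (uncompressed cache, then compressed cache, then the file, backfilling whichever tiers missed) reduces to the following shape. This is a deliberately simplified stand-in that uses plain maps and stub helpers instead of the real cache, compression, and I/O types:

#include <map>
#include <string>

std::map<std::string, std::string> uncompressed_cache;  // tier (1) stand-in
std::map<std::string, std::string> compressed_cache;    // tier (2) stand-in

std::string ReadBlockFromFile(const std::string& key) { return "raw:" + key; }
std::string Uncompress(const std::string& raw) { return "block:" + raw; }

std::string GetBlock(const std::string& cache_key) {
  if (auto it = uncompressed_cache.find(cache_key);
      it != uncompressed_cache.end()) {
    return it->second;  // (1) hit in the uncompressed cache
  }
  if (auto it = compressed_cache.find(cache_key);
      it != compressed_cache.end()) {
    std::string block = Uncompress(it->second);    // (2) hit in compressed cache
    uncompressed_cache.emplace(cache_key, block);  // backfill tier (1)
    return block;
  }
  std::string raw = ReadBlockFromFile(cache_key);  // (3) fall back to the file
  compressed_cache.emplace(cache_key, raw);        // backfill both tiers
  std::string block = Uncompress(raw);
  uncompressed_cache.emplace(cache_key, block);
  return block;
}
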
+ template <typename TBlocklike> + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). + template <typename TBlocklike> + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache, bool wait_for_cache, + bool async_read) const; + + DECLARE_SYNC_AND_ASYNC_CONST( + void, RetrieveMultipleBlocks, const ReadOptions& options, + const MultiGetRange* batch, + const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles, + autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses, + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* + results, + char* scratch, const UncompressionDict& uncompression_dict); + + // Get the iterator from the index reader. + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. + // + // Note: ErrorIterator with Status::Incomplete shall be returned if all the + // following conditions are met: + // 1. We enabled table_options.cache_index_and_filter_blocks. + // 2. index is not present in block cache. + // 3. We disallowed any io to be performed, that is, read_options == + // kBlockCacheTier + InternalIteratorBase<IndexValue>* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + + // Read block cache from block caches (if set): block_cache and + // block_cache_compressed. + // On success, Status::OK with be returned and @block will be populated with + // pointer to the block as well as its block handle. + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + template <typename TBlocklike> + Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + const ReadOptions& read_options, + CachableEntry<TBlocklike>* block, + const UncompressionDict& uncompression_dict, + BlockType block_type, const bool wait, + GetContext* get_context) const; + + // Put a maybe compressed block to the corresponding block caches. + // This method will perform decompression against block_contents if needed + // and then populate the block caches. + // On success, Status::OK will be returned; also @block will be populated with + // uncompressed block and its cache handle. + // + // Allocated memory managed by block_contents will be transferred to + // PutDataBlockToCache(). After the call, the object will be invalid. + // @param uncompression_dict Data for presetting the compression library's + // dictionary. 
+ template <typename TBlocklike> + Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + CachableEntry<TBlocklike>* cached_block, + BlockContents&& block_contents, + CompressionType block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, + BlockType block_type, + GetContext* get_context) const; + + // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found + // after a call to Seek(key), until handle_result returns false. + // May not make such a call if filter policy says that key is not present. + friend class TableCache; + friend class BlockBasedTableBuilder; + + // Create a index reader based on the index type stored in the table. + // Optionally, user can pass a preloaded meta_index_iter for the index that + // need to access extra meta blocks for index construction. This parameter + // helps avoid re-reading meta index block if caller already created one. + Status CreateIndexReader(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader); + + bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const; + + void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const; + + // If force_direct_prefetch is true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). + static Status PrefetchTail( + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer); + Status ReadMetaIndexBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* metaindex_block, + std::unique_ptr<InternalIterator>* iter); + Status ReadPropertiesBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); + Status PrefetchIndexAndFilterBlocks( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin, + BlockCacheLookupContext* lookup_context); + + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); + + Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter); + Status VerifyChecksumInBlocks(const ReadOptions& read_options, + InternalIteratorBase<IndexValue>* index_iter); + + // Create the filter from the filter block. 
+ std::unique_ptr<FilterBlockReader> CreateFilterBlockReader( + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + // Size of all data blocks, maybe approximate + uint64_t GetApproximateDataSize(); + + // Given an iterator return its offset in data block section of file. + uint64_t ApproximateDataOffsetOf( + const InternalIteratorBase<IndexValue>& index_iter, + uint64_t data_size) const; + + // Helper functions for DumpTable() + Status DumpIndexBlock(std::ostream& out_stream); + Status DumpDataBlocks(std::ostream& out_stream); + void DumpKeyValue(const Slice& key, const Slice& value, + std::ostream& out_stream); + + // Returns false if prefix_extractor exists and is compatible with that used + // in building the table file, otherwise true. + bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; + + // A cumulative data block file read in MultiGet lower than this size will + // use a stack buffer + static constexpr size_t kMultiGetReadStackBufSize = 8192; + + friend class PartitionedFilterBlockReader; + friend class PartitionedFilterBlockTest; + friend class DBBasicTest_MultiGetIOBufferOverrun_Test; +}; + +// Maintaining state of a two-level iteration on a partitioned index structure. +class BlockBasedTable::PartitionedIndexIteratorState + : public TwoLevelIteratorState { + public: + PartitionedIndexIteratorState( + const BlockBasedTable* table, + UnorderedMap<uint64_t, CachableEntry<Block>>* block_map); + InternalIteratorBase<IndexValue>* NewSecondaryIterator( + const BlockHandle& index_value) override; + + private: + // Don't own table_ + const BlockBasedTable* table_; + UnorderedMap<uint64_t, CachableEntry<Block>>* block_map_; +}; + +// Stores all the properties associated with a BlockBasedTable. +// These are immutable. +struct BlockBasedTable::Rep { + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, + const BlockBasedTableOptions& _table_opt, + const InternalKeyComparator& _internal_comparator, bool skip_filters, + uint64_t _file_size, int _level, const bool _immortal_table) + : ioptions(_ioptions), + env_options(_env_options), + table_options(_table_opt), + filter_policy(skip_filters ? 
nullptr : _table_opt.filter_policy.get()), + internal_comparator(_internal_comparator), + filter_type(FilterType::kNoFilter), + index_type(BlockBasedTableOptions::IndexType::kBinarySearch), + whole_key_filtering(_table_opt.whole_key_filtering), + prefix_filtering(true), + global_seqno(kDisableGlobalSequenceNumber), + file_size(_file_size), + level(_level), + immortal_table(_immortal_table) {} + ~Rep() { status.PermitUncheckedError(); } + const ImmutableOptions& ioptions; + const EnvOptions& env_options; + const BlockBasedTableOptions table_options; + const FilterPolicy* const filter_policy; + const InternalKeyComparator& internal_comparator; + Status status; + std::unique_ptr<RandomAccessFileReader> file; + OffsetableCacheKey base_cache_key; + PersistentCacheOptions persistent_cache_options; + + // Footer contains the fixed table information + Footer footer; + + std::unique_ptr<IndexReader> index_reader; + std::unique_ptr<FilterBlockReader> filter; + std::unique_ptr<UncompressionDictReader> uncompression_dict_reader; + + enum class FilterType { + kNoFilter, + kFullFilter, + kPartitionedFilter, + }; + FilterType filter_type; + BlockHandle filter_handle; + BlockHandle compression_dict_handle; + + std::shared_ptr<const TableProperties> table_properties; + BlockBasedTableOptions::IndexType index_type; + bool whole_key_filtering; + bool prefix_filtering; + std::shared_ptr<const SliceTransform> table_prefix_extractor; + + std::shared_ptr<FragmentedRangeTombstoneList> fragmented_range_dels; + + // If global_seqno is used, all Keys in this file will have the same + // seqno with value `global_seqno`. + // + // A value of kDisableGlobalSequenceNumber means that this feature is disabled + // and every key have it's own seqno. + SequenceNumber global_seqno; + + // Size of the table file on disk + uint64_t file_size; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + bool blocks_definitely_zstd_compressed = false; + + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + + const bool immortal_table; + + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + table_reader_cache_res_handle = nullptr; + + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilterPartitionIndex || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties + ? table_properties->column_family_id + : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context:: + kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? 
TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } + void CreateFilePrefetchBuffer( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead, + uint64_t num_file_reads, + uint64_t num_file_reads_for_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer( + readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */, + implicit_auto_readahead, num_file_reads, + num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock, + ioptions.stats)); + } + + void CreateFilePrefetchBufferIfNotExists( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead, + uint64_t num_file_reads, + uint64_t num_file_reads_for_auto_readahead) const { + if (!(*fpb)) { + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead, num_file_reads, + num_file_reads_for_auto_readahead); + } + } + + std::size_t ApproximateMemoryUsage() const { + std::size_t usage = 0; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BlockBasedTable::Rep*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } +}; + +// This is an adapter class for `WritableFile` to be used for `std::ostream`. +// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream` +// constructor for storing streaming data. +// Note: +// * This adapter doesn't provide any buffering, each write is forwarded to +// `WritableFile->Append()` directly. +// * For a failed write, the user needs to check the status by `ostream.good()` +class WritableFileStringStreamAdapter : public std::stringbuf { + public: + explicit WritableFileStringStreamAdapter(WritableFile* writable_file) + : file_(writable_file) {} + + // Override overflow() to handle `sputc()`. There are cases that will not go + // through `xsputn()` e.g. `std::endl` or an unsigned long long is written by + // `os.put()` directly and will call `sputc()` By internal implementation: + // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character + // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) : + // overflow(_Traits::to_int_type(_Ch)); + // } + // As we explicitly disabled buffering (_Pnavail() is always 0), every write, + // not captured by xsputn(), becomes an overflow here. + int overflow(int ch = EOF) override { + if (ch != EOF) { + Status s = file_->Append(Slice((char*)&ch, 1)); + if (s.ok()) { + return ch; + } + } + return EOF; + } + + std::streamsize xsputn(char const* p, std::streamsize n) override { + Status s = file_->Append(Slice(p, n)); + if (!s.ok()) { + return 0; + } + return n; + } + + private: + WritableFile* file_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader_impl.h b/src/rocksdb/table/block_based/block_based_table_reader_impl.h new file mode 100644 index 000000000..1f6f5f223 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader_impl.h @@ -0,0 +1,171 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/reader_common.h" + +// The file contains some member functions of BlockBasedTable that +// cannot be implemented in block_based_table_reader.cc because +// it's called by other files (e.g. block_based_iterator.h) and +// are templates. + +namespace ROCKSDB_NAMESPACE { +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read, + Status& s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry<Block> block; + if (rep_->uncompression_dict_reader && block_type == BlockType::kData) { + CachableEntry<UncompressionDict> uncompression_dict; + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, ro.verify_checksums, get_context, + lookup_context, &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true, /* wait_for_cache */ true, + async_read); + } else { + s = RetrieveBlock( + prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), &block, + block_type, get_context, lookup_context, for_compaction, + /* use_cache */ true, /* wait_for_cache */ true, async_read); + } + + if (s.IsTryAgain() && async_read) { + return iter; + } + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), block_type, iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +// Convert an uncompressed data block (i.e CachableEntry<Block>) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), BlockType::kData, + iter, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h b/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h new file mode 100644 index 000000000..8c7547a2a --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h @@ -0,0 +1,760 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "util/async_file_reader.h" +#include "util/coro_utils.h" + +#if defined(WITHOUT_COROUTINES) || \ + (defined(USE_COROUTINES) && defined(WITH_COROUTINES)) + +namespace ROCKSDB_NAMESPACE { + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry<Block> returned will +// own the data bytes. +// If compression is enabled and also there is no compressed block cache, +// the adjacent blocks are read out in one IO (combined read) +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. Some of them me be NULL handles +// scratch - An optional contiguous buffer to read compressed blocks into +DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) +(const ReadOptions& options, const MultiGetRange* batch, + const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles, + autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses, + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const { + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableOptions& ioptions = rep_->ioptions; + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = + RetrieveBlock(nullptr, options, handle, uncompression_dict, + &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true, /* async_read */ false); + } + CO_RETURN; + } + + // In direct IO mode, blocks share the direct io buffer. + // Otherwise, blocks share the scratch buffer. 
+ const bool use_shared_buffer = file->use_direct_io() || scratch != nullptr; + + autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + + uint64_t prev_offset = 0; + size_t prev_len = 0; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + size_t prev_end = static_cast<size_t>(prev_offset) + prev_len; + + // If current block is adjacent to the previous one, at the same time, + // compression is enabled and there is no compressed cache, we combine + // the two block read as one. + // We don't combine block reads here in direct IO mode, because when doing + // direct IO read, the block requests will be realigned and merged when + // necessary. + if (use_shared_buffer && !file->use_direct_io() && + prev_end == handle.offset()) { + req_offset_for_block.emplace_back(prev_len); + prev_len += BlockSizeWithTrailer(handle); + } else { + // No compression or current block and previous one is not adjacent: + // Step 1, create a new request for previous blocks + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } else { + req.scratch = new char[req.len]; + } + read_reqs.emplace_back(req); + } + + // Step 2, remeber the previous block info + prev_offset = handle.offset(); + prev_len = BlockSizeWithTrailer(handle); + req_offset_for_block.emplace_back(0); + } + req_idx_for_block.emplace_back(read_reqs.size()); + + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, BlockSizeWithTrailer(handle)); + } + // Handle the last block and process the pending last request + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { + req.scratch = scratch + buf_offset; + } else { + req.scratch = new char[req.len]; + } + read_reqs.emplace_back(req); + } + + AlignedBuf direct_io_buf; + { + IOOptions opts; + IOStatus s = file->PrepareIOOptions(options, opts); + if (s.ok()) { +#if defined(WITH_COROUTINES) + if (file->use_direct_io()) { +#endif // WITH_COROUTINES + s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), + &direct_io_buf, options.rate_limiter_priority); +#if defined(WITH_COROUTINES) + } else { + co_await batch->context()->reader().MultiReadAsync( + file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf); + } +#endif // WITH_COROUTINES + } + if (!s.ok()) { + // Discard all the results in this batch if there is any time out + // or overall MultiRead error + for (FSReadRequest& req : read_reqs) { + req.status = s; + } + } + } + + idx_in_batch = 0; + size_t valid_batch_idx = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + assert(valid_batch_idx < req_idx_for_block.size()); + assert(valid_batch_idx < req_offset_for_block.size()); + assert(req_idx_for_block[valid_batch_idx] < read_reqs.size()); + size_t& req_idx = req_idx_for_block[valid_batch_idx]; 
+ size_t& req_offset = req_offset_for_block[valid_batch_idx]; + valid_batch_idx++; + FSReadRequest& req = read_reqs[req_idx]; + Status s = req.status; + if (s.ok()) { + if ((req.result.size() != req.len) || + (req_offset + BlockSizeWithTrailer(handle) > req.result.size())) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + std::to_string(handle.offset()) + ", expected " + + std::to_string(req.len) + " bytes, got " + + std::to_string(req.result.size())); + } + } + + BlockContents serialized_block; + if (s.ok()) { + if (!use_shared_buffer) { + // We allocated a buffer for this block. Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + assert(req.result.size() == BlockSizeWithTrailer(handle)); + assert(req_offset == 0); + serialized_block = + BlockContents(std::unique_ptr<char[]>(req.scratch), handle.size()); + } else { + // We used the scratch buffer or direct io buffer + // which are shared by the blocks. + // serialized_block does not have the ownership. + serialized_block = + BlockContents(Slice(req.result.data() + req_offset, handle.size())); + } +#ifndef NDEBUG + serialized_block.has_trailer = true; +#endif + + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + // Since the scratch might be shared, the offset of the data block in + // the buffer might not be 0. req.result.data() only point to the + // begin address of each read request, we need to add the offset + // in each read request. Checksum is stored in the block trailer, + // beyond the payload size. + s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset, + handle.size(), rep_->file->file_name(), + handle.offset()); + TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); + } + } else if (!use_shared_buffer) { + // Free the allocated scratch buffer. + delete[] req.scratch; + } + + if (s.ok()) { + // When the blocks share the same underlying buffer (scratch or direct io + // buffer), we may need to manually copy the block into heap if the + // serialized block has to be inserted into a cache. That falls into the + // following cases - + // 1. serialized block is not compressed, it needs to be inserted into + // the uncompressed block cache if there is one + // 2. If the serialized block is compressed, it needs to be inserted + // into the compressed block cache if there is one + // + // In all other cases, the serialized block is either uncompressed into a + // heap buffer or there is no cache at all. + CompressionType compression_type = + GetBlockCompressionType(serialized_block); + if (use_shared_buffer && (compression_type == kNoCompression || + (compression_type != kNoCompression && + rep_->table_options.block_cache_compressed))) { + Slice serialized = + Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); + serialized_block = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), + serialized), + handle.size()); +#ifndef NDEBUG + serialized_block.has_trailer = true; +#endif + } + } + + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry<Block>* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. 
Since we're passing the serialized block contents, it + // will avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, /*wait=*/true, + /*for_compaction=*/false, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &serialized_block, /*async_read=*/false); + + // block_entry value could be null if no block cache is present, i.e + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. In that case, fall + // through and set up the block explicitly + if (block_entry->GetValue() != nullptr) { + s.PermitUncheckedError(); + continue; + } + } + + CompressionType compression_type = + GetBlockCompressionType(serialized_block); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressSerializedBlock( + info, req.result.data() + req_offset, handle.size(), &contents, + footer.format_version(), rep_->ioptions, memory_allocator); + } else { + // There are two cases here: + // 1) caller uses the shared buffer (scratch or direct io buffer); + // 2) we use the requst buffer. + // If scratch buffer or direct io buffer is used, we ensure that + // all serialized blocks are copyed to the heap as single blocks. If + // scratch buffer is not used, we also have no combined read, so the + // serialized block can be used directly. + contents = std::move(serialized_block); + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue(std::make_unique<Block>( + std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); + } + } + (*statuses)[idx_in_batch] = s; + } +} + +using MultiGetRange = MultiGetContext::Range; +DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) +(const ReadOptions& read_options, const MultiGetRange* mget_range, + const SliceTransform* prefix_extractor, bool skip_filters) { + if (mget_range->empty()) { + // Caller should ensure non-empty (performance bug) + assert(false); + CO_RETURN; // Nothing to do + } + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + MultiGetRange sst_file_range(*mget_range, mget_range->begin(), + mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(filter, &sst_file_range, no_io, prefix_extractor, + &lookup_context, read_options.rate_limiter_priority); + + if (!sst_file_range.empty()) { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
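+    // The extra upper bound check compensates for hash index lookups that
+    // can no longer be trusted once the configured prefix extractor differs
+    // from the one the table was built with.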
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + sst_file_range.begin()->get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + uint64_t prev_offset = std::numeric_limits<uint64_t>::max(); + autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles; + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results; + autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses; + MultiGetContext::Mask reused_mask = 0; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr<char[]> block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + std::vector<Cache::Handle*> cache_handles; + bool wait_for_cache_results = false; + + CachableEntry<UncompressionDict> uncompression_dict; + Status uncompression_dict_status; + uncompression_dict_status.PermitUncheckedError(); + bool uncompression_dict_inited = false; + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + if (!iiter->status().IsNotFound()) { + *(miter->s) = iiter->status(); + } + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + read_options.verify_checksums, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + uncompression_dict_inited = true; + } + + if (!uncompression_dict_status.ok()) { + assert(!uncompression_dict_status.IsNotFound()); + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == prev_offset) { + // This key can reuse the previous block (later on). + // Mark previous as "reused" + reused_mask |= MultiGetContext::Mask{1} << (block_handles.size() - 1); + // Use null handle to indicate this one reuses same block as + // previous. + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + prev_offset = v.handle.offset(); + BlockHandle handle = v.handle; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const UncompressionDict& dict = uncompression_dict.GetValue() + ? 
*uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + Status s = RetrieveBlock( + nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + miter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ false, /* async_read */ false); + if (s.IsIncomplete()) { + s = Status::OK(); + } + if (s.ok() && !results.back().IsEmpty()) { + // Since we have a valid handle, check the value. If its nullptr, + // it means the cache is waiting for the final result and we're + // supposed to call WaitAll() to wait for the result. + if (results.back().GetValue() != nullptr) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk. + if (results.back().GetCacheHandle()) { + results.back().UpdateCachedValue(); + } + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + // We have to wait for the cache lookup to finish in the + // background, and then we may have to read the block from disk + // anyway + assert(results.back().GetCacheHandle()); + wait_for_cache_results = true; + block_handles.emplace_back(handle); + cache_handles.emplace_back(results.back().GetCacheHandle()); + } + } else { + block_handles.emplace_back(handle); + total_len += BlockSizeWithTrailer(handle); + } + } + + if (wait_for_cache_results) { + Cache* block_cache = rep_->table_options.block_cache.get(); + block_cache->WaitAll(cache_handles); + for (size_t i = 0; i < block_handles.size(); ++i) { + // If this block was a success or failure or not needed because + // the corresponding key is in the same block as a prior key, skip + if (block_handles[i] == BlockHandle::NullBlockHandle() || + results[i].IsEmpty()) { + continue; + } + results[i].UpdateCachedValue(); + void* val = results[i].GetValue(); + Cache::Handle* handle = results[i].GetCacheHandle(); + // GetContext for any key will do, as the stats will be aggregated + // anyway + GetContext* get_context = sst_file_range.begin()->get_context; + if (!val) { + // The async cache lookup failed - could be due to an error + // or a false positive. We need to read the data block from + // the SST file + results[i].Reset(); + total_len += BlockSizeWithTrailer(block_handles[i]); + UpdateCacheMissMetrics(BlockType::kData, get_context); + } else { + block_handles[i] = BlockHandle::NullBlockHandle(); + UpdateCacheHitMetrics(BlockType::kData, get_context, + block_cache->GetUsage(handle)); + } + } + } + + if (total_len) { + char* scratch = nullptr; + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + assert(uncompression_dict_inited || !rep_->uncompression_dict_reader); + assert(uncompression_dict_status.ok()); + // If using direct IO, then scratch is not used, so keep it nullptr. + // If the blocks need to be uncompressed and we don't need the + // compressed blocks, then we can use a contiguous block of + // memory to read in all the blocks as it will be temporary + // storage + // 1. If blocks are compressed and compressed block cache is there, + // alloc heap bufs + // 2. If blocks are uncompressed, alloc heap bufs + // 3. 
If blocks are compressed and no compressed block cache, use + // stack buf + if (!rep_->file->use_direct_io() && + rep_->table_options.block_cache_compressed == nullptr && + rep_->blocks_maybe_compressed) { + if (total_len <= kMultiGetReadStackBufSize) { + scratch = stack_buf; + } else { + scratch = new char[total_len]; + block_buf.reset(scratch); + } + } + CO_AWAIT(RetrieveMultipleBlocks) + (read_options, &data_block_range, &block_handles, &statuses, &results, + scratch, dict); + if (sst_file_range.begin()->get_context) { + ++(sst_file_range.begin() + ->get_context->get_context_stats_.num_sst_read); + } + } + } + + DataBlockIter first_biter; + DataBlockIter next_biter; + size_t idx_in_batch = 0; + SharedCleanablePtr shared_cleanable; + for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); + ++miter) { + Status s; + GetContext* get_context = miter->get_context; + const Slice& key = miter->ikey; + bool matched = false; // if such user key matched a key in SST + bool done = false; + bool first_block = true; + do { + DataBlockIter* biter = nullptr; + bool reusing_prev_block; + bool later_reused; + uint64_t referenced_data_size = 0; + bool does_referenced_key_exist = false; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*_get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr); + if (first_block) { + if (!block_handles[idx_in_batch].IsNull() || + !results[idx_in_batch].IsEmpty()) { + first_biter.Invalidate(Status::OK()); + NewDataBlockIterator<DataBlockIter>( + read_options, results[idx_in_batch], &first_biter, + statuses[idx_in_batch]); + reusing_prev_block = false; + } else { + // If handler is null and result is empty, then the status is never + // set, which should be the initial value: ok(). + assert(statuses[idx_in_batch].ok()); + reusing_prev_block = true; + } + biter = &first_biter; + later_reused = + (reused_mask & (MultiGetContext::Mask{1} << idx_in_batch)) != 0; + idx_in_batch++; + } else { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + next_biter.Invalidate(Status::OK()); + Status tmp_s; + NewDataBlockIterator<DataBlockIter>( + read_options, iiter->value().handle, &next_biter, + BlockType::kData, get_context, &lookup_data_block_context, + /* prefetch_buffer= */ nullptr, /* for_compaction = */ false, + /*async_read = */ false, tmp_s); + biter = &next_biter; + reusing_prev_block = false; + later_reused = false; + } + + if (read_options.read_tier == kBlockCacheTier && + biter->status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter->status().ok()) { + s = biter->status(); + break; + } + + // Reusing blocks complicates pinning/Cleanable, because the cache + // entry referenced by biter can only be released once all returned + // pinned values are released. This code previously did an extra + // block_cache Ref for each reuse, but that unnecessarily increases + // block cache contention. 
Instead we can use a variant of shared_ptr + // to release in block cache only once. + // + // Although the biter loop below might SaveValue multiple times for + // merges, just one value_pinner suffices, as MultiGet will merge + // the operands before returning to the API user. + Cleanable* value_pinner; + if (biter->IsValuePinned()) { + if (reusing_prev_block) { + // Note that we don't yet know if the MultiGet results will need + // to pin this block, so we might wrap a block for sharing and + // still end up with 1 (or 0) pinning ref. Not ideal but OK. + // + // Here we avoid adding redundant cleanups if we didn't end up + // delegating the cleanup from last time around. + if (!biter->HasCleanups()) { + assert(shared_cleanable.get()); + if (later_reused) { + shared_cleanable.RegisterCopyWith(biter); + } else { + shared_cleanable.MoveAsCleanupTo(biter); + } + } + } else if (later_reused) { + assert(biter->HasCleanups()); + // Make the existing cleanups on `biter` sharable: + shared_cleanable.Allocate(); + // Move existing `biter` cleanup(s) to `shared_cleanable` + biter->DelegateCleanupsTo(&*shared_cleanable); + // Reference `shared_cleanable` as new cleanup for `biter` + shared_cleanable.RegisterCopyWith(biter); + } + assert(biter->HasCleanups()); + value_pinner = biter; + } else { + value_pinner = nullptr; + } + + bool may_exist = biter->SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. + break; + } + + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Status pik_status = ParseInternalKey( + biter->key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; + } + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } + done = true; + break; + } + s = biter->status(); + } + // Write the block cache access. + // XXX: There appear to be 'break' statements above that bypass this + // writing of the block cache trace record + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + !reusing_prev_block) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.clock->NowMicros(), + /*_block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*_cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*_referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + // TODO: Should handle status here? 
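+        // For now the trace write status is swallowed via
+        // PermitUncheckedError() below, so a tracing failure does not
+        // affect the result of the read itself.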
+ block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + if (!iiter->Valid()) { + break; + } + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + *(miter->s) = s; + } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + // Not sure why we need to do it. Should investigate more. + for (auto& st : statuses) { + st.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } +} +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/table/block_based/block_based_table_reader_test.cc b/src/rocksdb/table/block_based/block_based_table_reader_test.cc new file mode 100644 index 000000000..c5a615dfc --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader_test.cc @@ -0,0 +1,572 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/block_based_table_reader.h" + +#include <cmath> +#include <memory> +#include <string> + +#include "cache/cache_reservation_manager.h" +#include "db/db_test_util.h" +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/partitioned_index_iterator.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTableReaderBaseTest : public testing::Test { + protected: + // Prepare key-value pairs to occupy multiple blocks. + // Each value is 256B, every 16 pairs constitute 1 block. + // If mixed_with_human_readable_string_value == true, + // then adjacent blocks contain values with different compression + // complexity: human readable strings are easier to compress than random + // strings. + static std::map<std::string, std::string> GenerateKVMap( + int num_block = 100, + bool mixed_with_human_readable_string_value = false) { + std::map<std::string, std::string> kv; + + Random rnd(101); + uint32_t key = 0; + for (int block = 0; block < num_block; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. + sprintf(k, "%08u", key); + std::string v; + if (mixed_with_human_readable_string_value) { + v = (block % 2) ? 
rnd.HumanReadableString(256) + : rnd.RandomString(256); + } else { + v = rnd.RandomString(256); + } + kv[std::string(k)] = v; + key++; + } + } + return kv; + } + + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_based_table_reader_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + ConfigureTableFactory(); + } + + virtual void ConfigureTableFactory() = 0; + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + // Creates a table with the specificied key value pairs (kv). + void CreateTable(const std::string& table_name, + const CompressionType& compression_type, + const std::map<std::string, std::string>& kv) { + std::unique_ptr<WritableFileWriter> writer; + NewFileWriter(table_name, &writer); + + // Create table builder. + ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + ColumnFamilyOptions cf_options; + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr<TableBuilder> table_builder( + options_.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, + kDefaultColumnFamilyName, -1 /* level */), + writer.get())); + + // Build table. + for (auto it = kv.begin(); it != kv.end(); it++) { + std::string k = ToInternalKey(it->first); + std::string v = it->second; + table_builder->Add(k, v); + } + ASSERT_OK(table_builder->Finish()); + } + + void NewBlockBasedTableReader(const FileOptions& foptions, + const ImmutableOptions& ioptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr<BlockBasedTable>* table, + bool prefetch_index_and_filter_in_cache = true, + Status* status = nullptr) { + const MutableCFOptions moptions(options_); + TableReaderOptions table_reader_options = TableReaderOptions( + ioptions, moptions.prefix_extractor, EnvOptions(), comparator); + + std::unique_ptr<RandomAccessFileReader> file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr<TableReader> general_table; + Status s = options_.table_factory->NewTableReader( + ReadOptions(), table_reader_options, std::move(file), file_size, + &general_table, prefetch_index_and_filter_in_cache); + + if (s.ok()) { + table->reset(reinterpret_cast<BlockBasedTable*>(general_table.release())); + } + + if (status) { + *status = s; + } + } + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } + + std::string test_dir_; + Env* env_; + std::shared_ptr<FileSystem> fs_; + Options options_; + + private: + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr<FSWritableFile> f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const std::string& filename, + std::unique_ptr<WritableFileWriter>* writer) { + std::string path = Path(filename); + EnvOptions env_options; + FileOptions foptions; + std::unique_ptr<FSWritableFile> file; + ASSERT_OK(fs_->NewWritableFile(path, foptions, &file, nullptr)); + writer->reset(new WritableFileWriter(std::move(file), path, env_options)); + } + + void NewFileReader(const std::string& filename, const 
FileOptions& opt, + std::unique_ptr<RandomAccessFileReader>* reader) { + std::string path = Path(filename); + std::unique_ptr<FSRandomAccessFile> f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } +}; + +class BlockBasedTableReaderTest + : public BlockBasedTableReaderBaseTest, + public testing::WithParamInterface<std::tuple< + CompressionType, bool, BlockBasedTableOptions::IndexType, bool>> { + protected: + void SetUp() override { + compression_type_ = std::get<0>(GetParam()); + use_direct_reads_ = std::get<1>(GetParam()); + BlockBasedTableReaderBaseTest::SetUp(); + } + + void ConfigureTableFactory() override { + BlockBasedTableOptions opts; + opts.index_type = std::get<2>(GetParam()); + opts.no_block_cache = std::get<3>(GetParam()); + options_.table_factory.reset( + static_cast<BlockBasedTableFactory*>(NewBlockBasedTableFactory(opts))); + } + + CompressionType compression_type_; + bool use_direct_reads_; +}; + +// Tests MultiGet in both direct IO and non-direct IO mode. +// The keys should be in cache after MultiGet. +TEST_P(BlockBasedTableReaderTest, MultiGet) { + std::map<std::string, std::string> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 100 /* num_block */, + true /* mixed_with_human_readable_string_value */); + + // Prepare keys, values, and statuses for MultiGet. + autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> keys; + autovector<PinnableSlice, MultiGetContext::MAX_BATCH_SIZE> values; + autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses; + { + const int step = + static_cast<int>(kv.size()) / MultiGetContext::MAX_BATCH_SIZE; + auto it = kv.begin(); + for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE; i++) { + keys.emplace_back(it->first); + values.emplace_back(); + statuses.emplace_back(); + std::advance(it, step); + } + } + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr<BlockBasedTable> table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Ensure that keys are not in cache before MultiGet. + for (auto& key : keys) { + ASSERT_FALSE(table->TEST_KeyInCache(ReadOptions(), key)); + } + + // Prepare MultiGetContext. 
+ autovector<GetContext, MultiGetContext::MAX_BATCH_SIZE> get_context; + autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context; + autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys; + for (size_t i = 0; i < keys.size(); ++i) { + get_context.emplace_back(BytewiseComparator(), nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[i], &values[i], + nullptr, nullptr, nullptr, nullptr, + true /* do_merge */, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr); + key_context.emplace_back(nullptr, keys[i], &values[i], nullptr, + &statuses.back()); + key_context.back().get_context = &get_context.back(); + } + for (auto& key_ctx : key_context) { + sorted_keys.emplace_back(&key_ctx); + } + MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions(), + fs_.get(), nullptr); + + // Execute MultiGet. + MultiGetContext::Range range = ctx.GetMultiGetRange(); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->Reset(); + table->MultiGet(ReadOptions(), &range, nullptr); + + ASSERT_GE(perf_ctx->block_read_count - perf_ctx->index_block_read_count - + perf_ctx->filter_block_read_count - + perf_ctx->compression_dict_block_read_count, + 1); + ASSERT_GE(perf_ctx->block_read_byte, 1); + + for (const Status& status : statuses) { + ASSERT_OK(status); + } + // Check that keys are in cache after MultiGet. + for (size_t i = 0; i < keys.size(); i++) { + ASSERT_TRUE(table->TEST_KeyInCache(ReadOptions(), keys[i])); + ASSERT_EQ(values[i].ToString(), kv[keys[i].ToString()]); + } +} + +class ChargeTableReaderTest + : public BlockBasedTableReaderBaseTest, + public testing::WithParamInterface< + CacheEntryRoleOptions::Decision /* charge_table_reader_mem */> { + protected: + static std::size_t CalculateMaxTableReaderNumBeforeCacheFull( + std::size_t cache_capacity, std::size_t approx_table_reader_mem) { + // To make calculation easier for testing + assert(cache_capacity % CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>:: + GetDummyEntrySize() == + 0 && + cache_capacity >= 2 * CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>:: + GetDummyEntrySize()); + + // We need to subtract 1 for max_num_dummy_entry to account for dummy + // entries' overhead, assumed the overhead is no greater than 1 dummy entry + // size + std::size_t max_num_dummy_entry = + (size_t)std::floor(( + 1.0 * cache_capacity / + CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize())) - + 1; + std::size_t cache_capacity_rounded_to_dummy_entry_multiples = + max_num_dummy_entry * + CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize(); + std::size_t max_table_reader_num_capped = static_cast<std::size_t>( + std::floor(1.0 * cache_capacity_rounded_to_dummy_entry_multiples / + approx_table_reader_mem)); + + return max_table_reader_num_capped; + } + + void SetUp() override { + // To cache and re-use the same kv map and compression type in the test + // suite for elimiating variance caused by these two factors + kv_ = BlockBasedTableReaderBaseTest::GenerateKVMap(); + compression_type_ = CompressionType::kNoCompression; + + table_reader_charge_tracking_cache_ = std::make_shared< + TargetCacheChargeTrackingCache< + CacheEntryRole::kBlockBasedTableReader>>((NewLRUCache( + 4 * CacheReservationManagerImpl< + CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize(), + 0 /* num_shard_bits */, true /* strict_capacity_limit */))); + + // To ApproximateTableReaderMem() without being affected by + // 
the feature of charging its memory, we turn off the feature + charge_table_reader_ = CacheEntryRoleOptions::Decision::kDisabled; + BlockBasedTableReaderBaseTest::SetUp(); + approx_table_reader_mem_ = ApproximateTableReaderMem(); + + // Now we condtionally turn on the feature to test + charge_table_reader_ = GetParam(); + ConfigureTableFactory(); + } + + void ConfigureTableFactory() override { + BlockBasedTableOptions table_options; + table_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kBlockBasedTableReader, + {/*.charged = */ charge_table_reader_}}); + table_options.block_cache = table_reader_charge_tracking_cache_; + + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + table_options.partition_filters = true; + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + + CacheEntryRoleOptions::Decision charge_table_reader_; + std::shared_ptr< + TargetCacheChargeTrackingCache<CacheEntryRole::kBlockBasedTableReader>> + table_reader_charge_tracking_cache_; + std::size_t approx_table_reader_mem_; + std::map<std::string, std::string> kv_; + CompressionType compression_type_; + + private: + std::size_t ApproximateTableReaderMem() { + std::size_t approx_table_reader_mem = 0; + + std::string table_name = "table_for_approx_table_reader_mem"; + CreateTable(table_name, compression_type_, kv_); + + std::unique_ptr<BlockBasedTable> table; + Status s; + NewBlockBasedTableReader( + FileOptions(), ImmutableOptions(options_), + InternalKeyComparator(options_.comparator), table_name, &table, + false /* prefetch_index_and_filter_in_cache */, &s); + assert(s.ok()); + + approx_table_reader_mem = table->ApproximateMemoryUsage(); + assert(approx_table_reader_mem > 0); + return approx_table_reader_mem; + } +}; + +INSTANTIATE_TEST_CASE_P( + ChargeTableReaderTest, ChargeTableReaderTest, + ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled, + CacheEntryRoleOptions::Decision::kDisabled)); + +TEST_P(ChargeTableReaderTest, Basic) { + const std::size_t max_table_reader_num_capped = + ChargeTableReaderTest::CalculateMaxTableReaderNumBeforeCacheFull( + table_reader_charge_tracking_cache_->GetCapacity(), + approx_table_reader_mem_); + + // Acceptable estimtation errors coming from + // 1. overstimate max_table_reader_num_capped due to # dummy entries is high + // and results in metadata charge overhead greater than 1 dummy entry size + // (violating our assumption in calculating max_table_reader_num_capped) + // 2. 
overestimate/underestimate max_table_reader_num_capped due to the gap + // between ApproximateTableReaderMem() and actual table reader mem + std::size_t max_table_reader_num_capped_upper_bound = + (std::size_t)(max_table_reader_num_capped * 1.05); + std::size_t max_table_reader_num_capped_lower_bound = + (std::size_t)(max_table_reader_num_capped * 0.95); + std::size_t max_table_reader_num_uncapped = + (std::size_t)(max_table_reader_num_capped * 1.1); + ASSERT_GT(max_table_reader_num_uncapped, + max_table_reader_num_capped_upper_bound) + << "We need `max_table_reader_num_uncapped` > " + "`max_table_reader_num_capped_upper_bound` to differentiate cases " + "between " + "charge_table_reader_ == kDisabled and == kEnabled)"; + + Status s = Status::OK(); + std::size_t opened_table_reader_num = 0; + std::string table_name; + std::vector<std::unique_ptr<BlockBasedTable>> tables; + // Keep creating BlockBasedTableReader till hiting the memory limit based on + // cache capacity and creation fails (when charge_table_reader_ == + // kEnabled) or reaching a specfied big number of table readers (when + // charge_table_reader_ == kDisabled) + while (s.ok() && opened_table_reader_num < max_table_reader_num_uncapped) { + table_name = "table_" + std::to_string(opened_table_reader_num); + CreateTable(table_name, compression_type_, kv_); + tables.push_back(std::unique_ptr<BlockBasedTable>()); + NewBlockBasedTableReader( + FileOptions(), ImmutableOptions(options_), + InternalKeyComparator(options_.comparator), table_name, &tables.back(), + false /* prefetch_index_and_filter_in_cache */, &s); + if (s.ok()) { + ++opened_table_reader_num; + } + } + + if (charge_table_reader_ == CacheEntryRoleOptions::Decision::kEnabled) { + EXPECT_TRUE(s.IsMemoryLimit()) << "s: " << s.ToString(); + EXPECT_TRUE(s.ToString().find( + kCacheEntryRoleToCamelString[static_cast<std::uint32_t>( + CacheEntryRole::kBlockBasedTableReader)]) != + std::string::npos); + EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") != + std::string::npos); + + EXPECT_GE(opened_table_reader_num, max_table_reader_num_capped_lower_bound); + EXPECT_LE(opened_table_reader_num, max_table_reader_num_capped_upper_bound); + + std::size_t updated_max_table_reader_num_capped = + ChargeTableReaderTest::CalculateMaxTableReaderNumBeforeCacheFull( + table_reader_charge_tracking_cache_->GetCapacity() / 2, + approx_table_reader_mem_); + + // Keep deleting BlockBasedTableReader to lower down memory usage from the + // memory limit to make the next creation succeeds + while (opened_table_reader_num >= updated_max_table_reader_num_capped) { + tables.pop_back(); + --opened_table_reader_num; + } + table_name = "table_for_successful_table_reader_open"; + CreateTable(table_name, compression_type_, kv_); + tables.push_back(std::unique_ptr<BlockBasedTable>()); + NewBlockBasedTableReader( + FileOptions(), ImmutableOptions(options_), + InternalKeyComparator(options_.comparator), table_name, &tables.back(), + false /* prefetch_index_and_filter_in_cache */, &s); + EXPECT_TRUE(s.ok()) << s.ToString(); + + tables.clear(); + EXPECT_EQ(table_reader_charge_tracking_cache_->GetCacheCharge(), 0); + } else { + EXPECT_TRUE(s.ok() && + opened_table_reader_num == max_table_reader_num_uncapped) + << "s: " << s.ToString() << " opened_table_reader_num: " + << std::to_string(opened_table_reader_num); + EXPECT_EQ(table_reader_charge_tracking_cache_->GetCacheCharge(), 0); + } +} + +class BlockBasedTableReaderTestVerifyChecksum + : public BlockBasedTableReaderTest { + public: + 
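+  // Reuses the parameterized reader fixture; the ChecksumMismatch test below
+  // locates a second-level (partitioned) index block, corrupts it on disk,
+  // and expects VerifyChecksum() to report a Corruption status.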
BlockBasedTableReaderTestVerifyChecksum() : BlockBasedTableReaderTest() {} +}; + +TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { + std::map<std::string, std::string> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap(800 /* num_block */); + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr<BlockBasedTable> table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Use the top level iterator to find the offset/size of the first + // 2nd level index block and corrupt the block + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum}; + InternalIteratorBase<IndexValue>* iiter = table->NewIndexIterator( + ReadOptions(), /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + ASSERT_OK(iiter->status()); + iiter->SeekToFirst(); + BlockHandle handle = static_cast<PartitionedIndexIterator*>(iiter) + ->index_iter_->value() + .handle; + table.reset(); + + // Corrupt the block pointed to by handle + ASSERT_OK(test::CorruptFile(options.env, Path(table_name), + static_cast<int>(handle.offset()), 128)); + + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + Status s = table->VerifyChecksum(ReadOptions(), + TableReaderCaller::kUserVerifyChecksum); + ASSERT_EQ(s.code(), Status::kCorruption); +} + +// Param 1: compression type +// Param 2: whether to use direct reads +// Param 3: Block Based Table Index type +// Param 4: BBTO no_block_cache option +#ifdef ROCKSDB_LITE +// Skip direct I/O tests in lite mode since direct I/O is unsupported. +INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#else // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#endif // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values( + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), + ::testing::Values(true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_builder.cc b/src/rocksdb/table/block_based/block_builder.cc new file mode 100644 index 000000000..92702b17d --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. + +#include "table/block_based/block_builder.h" + +#include <assert.h> + +#include <algorithm> + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "table/block_based/data_block_footer.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +BlockBuilder::BlockBuilder( + int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding, + BlockBasedTableOptions::DataBlockIndexType index_type, + double data_block_hash_table_util_ratio) + : block_restart_interval_(block_restart_interval), + use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + restarts_(1, 0), // First restart point is at offset 0 + counter_(0), + finished_(false) { + switch (index_type) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + data_block_hash_index_builder_.Initialize( + data_block_hash_table_util_ratio); + break; + default: + assert(0); + } + assert(block_restart_interval_ >= 1); + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.resize(1); // First restart point is at offset 0 + assert(restarts_[0] == 0); + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); + counter_ = 0; + finished_ = false; + last_key_.clear(); + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Reset(); + } +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif +} + +void BlockBuilder::SwapAndReset(std::string& buffer) { + std::swap(buffer_, buffer); + Reset(); +} + +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, + const Slice& value) const { + size_t estimate = CurrentSizeEstimate(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. + estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. 
+ estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + + if (counter_ >= block_restart_interval_) { + estimate += sizeof(uint32_t); // a new restart entry. + } + + estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. + estimate += VarintLength(key.size()); // varint for key length. + if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } + + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + + uint32_t num_restarts = static_cast<uint32_t>(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(!add_with_last_key_called_); + + AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size()); + if (use_delta_encoding_) { + // Update state + // We used to just copy the changed data, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } +} + +void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key_param, + const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(last_key_.empty()); +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif + + // Here we make sure to use an empty `last_key` on first call after creation + // or Reset. This is more convenient for the caller and we can be more + // clever inside BlockBuilder. On this hot code path, we want to avoid + // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a + // fast min operation instead, with an assertion to be sure our logic is + // sound. 
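+  // If buffer_ is empty this is the first entry since creation/Reset(), so
+  // min(buffer_size, last_key_size) == 0 and last_key degenerates to an empty
+  // slice; otherwise buffer_ already holds at least last_key, so the min is a
+  // no-op (guarded by the assertion below).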
+ size_t buffer_size = buffer_.size(); + size_t last_key_size = last_key_param.size(); + assert(buffer_size == 0 || buffer_size >= last_key_size); + + Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size)); + + AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); +} + +inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, + const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size) { + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); + size_t shared = 0; // number of bytes shared with prev key + if (counter_ >= block_restart_interval_) { + // Restart compression + restarts_.push_back(static_cast<uint32_t>(buffer_size)); + estimate_ += sizeof(uint32_t); + counter_ = 0; + } else if (use_delta_encoding_) { + // See how much sharing to do with previous string + shared = key.difference_offset(last_key); + } + + const size_t non_shared = key.size() - shared; + + if (use_value_delta_encoding_) { + // Add "<shared><non_shared>" to buffer_ + PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared)); + } else { + // Add "<shared><non_shared><value_size>" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared), + static_cast<uint32_t>(value.size())); + } + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } + + counter_++; + estimate_ += buffer_.size() - buffer_size; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_builder.h b/src/rocksdb/table/block_based/block_builder.h new file mode 100644 index 000000000..5f68b449b --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <vector> + +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/block_based/data_block_hash_index.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder { + public: + BlockBuilder(const BlockBuilder&) = delete; + void operator=(const BlockBuilder&) = delete; + + explicit BlockBuilder(int block_restart_interval, + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); + + // Reset the contents as if the BlockBuilder was just constructed. 
+ void Reset(); + + // Swap the contents in BlockBuilder with buffer, then reset the BlockBuilder. + void SwapAndReset(std::string& buffer); + + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use + // AddWithLastKey() in contexts where previous added key is already known + // and delta encoding might be used. + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); + + // A faster version of Add() if the previous key is already known for all + // Add()s. + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key + // is the key from most recent AddWithLastKey. (For convenience, last_key + // is ignored on first call after creation or Reset().) + // DO NOT mix with Add() between Resets. + void AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value = nullptr); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { return buffer_.empty(); } + + private: + inline void AddWithLastKeyImpl(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size); + + const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder + const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; + + std::string buffer_; // Destination buffer + std::vector<uint32_t> restarts_; // Restart points + size_t estimate_; + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; +#ifndef NDEBUG + bool add_with_last_key_called_ = false; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_like_traits.h b/src/rocksdb/table/block_based/block_like_traits.h new file mode 100644 index 000000000..d406dbb5d --- /dev/null +++ b/src/rocksdb/table/block_based/block_like_traits.h @@ -0,0 +1,182 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "cache/cache_entry_roles.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +template <typename TBlocklike> +class BlocklikeTraits; + +template <typename T, CacheEntryRole R> +Cache::CacheItemHelper* GetCacheItemHelperForRole(); + +template <typename TBlocklike> +Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, + Statistics* statistics, bool using_zstd, + const FilterPolicy* filter_policy) { + return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( + const void* buf, size_t size, void** out_obj, + size_t* charge) -> Status { + assert(buf != nullptr); + std::unique_ptr<char[]> buf_data(new char[size]()); + memcpy(buf_data.get(), buf, size); + BlockContents bc = BlockContents(std::move(buf_data), size); + TBlocklike* ucd_ptr = BlocklikeTraits<TBlocklike>::Create( + std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, + filter_policy); + *out_obj = reinterpret_cast<void*>(ucd_ptr); + *charge = size; + return Status::OK(); + }; +} + +template <> +class BlocklikeTraits<ParsedFullFilterBlock> { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast<ParsedFullFilterBlock*>(obj); + return ptr->GetBlockContentsData().size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast<ParsedFullFilterBlock*>(from_obj); + const char* buf = ptr->GetBlockContentsData().data(); + assert(length == ptr->GetBlockContentsData().size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kFilter); + return GetCacheItemHelperForRole<ParsedFullFilterBlock, + CacheEntryRole::kFilterBlock>(); + } +}; + +template <> +class BlocklikeTraits<Block> { + public: + static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics, bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + Block* ptr = static_cast<Block*>(obj); + return ptr->size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + Block* ptr = static_cast<Block*>(from_obj); + const char* buf = ptr->data(); + assert(length == ptr->size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return GetCacheItemHelperForRole<Block, CacheEntryRole::kDataBlock>(); + case BlockType::kIndex: + 
return GetCacheItemHelperForRole<Block, CacheEntryRole::kIndexBlock>(); + case BlockType::kFilterPartitionIndex: + return GetCacheItemHelperForRole<Block, + CacheEntryRole::kFilterMetaBlock>(); + default: + // Not a recognized combination + assert(false); + FALLTHROUGH_INTENDED; + case BlockType::kRangeDeletion: + return GetCacheItemHelperForRole<Block, CacheEntryRole::kOtherBlock>(); + } + } +}; + +template <> +class BlocklikeTraits<UncompressionDict> { + public: + static UncompressionDict* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + UncompressionDict* ptr = static_cast<UncompressionDict*>(obj); + return ptr->slice_.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + UncompressionDict* ptr = static_cast<UncompressionDict*>(from_obj); + const char* buf = ptr->slice_.data(); + assert(length == ptr->slice_.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kCompressionDictionary); + return GetCacheItemHelperForRole<UncompressionDict, + CacheEntryRole::kOtherBlock>(); + } +}; + +// Get an CacheItemHelper pointer for value type T and role R. +template <typename T, CacheEntryRole R> +Cache::CacheItemHelper* GetCacheItemHelperForRole() { + static Cache::CacheItemHelper cache_helper( + BlocklikeTraits<T>::SizeCallback, BlocklikeTraits<T>::SaveToCallback, + GetCacheEntryDeleterForRole<T, R>()); + return &cache_helper; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefetcher.cc b/src/rocksdb/table/block_based/block_prefetcher.cc new file mode 100644 index 000000000..83ec2cb06 --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefetcher.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_prefetcher.h" + +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +void BlockPrefetcher::PrefetchIfNeeded( + const BlockBasedTable::Rep* rep, const BlockHandle& handle, + const size_t readahead_size, bool is_for_compaction, + const bool no_sequential_checking, + const Env::IOPriority rate_limiter_priority) { + // num_file_reads is used by FilePrefetchBuffer only when + // implicit_auto_readahead is set. 
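+  // Prefetch policy, in priority order: (1) compaction reads get a fixed
+  // compaction_readahead_size buffer, (2) an explicit readahead size from the
+  // caller is honored as-is, (3) otherwise implicit auto-readahead kicks in
+  // once enough sequential reads are observed (or immediately when sequential
+  // checking is disabled), growing up to max_auto_readahead_size.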
+ if (is_for_compaction) { + rep->CreateFilePrefetchBufferIfNotExists( + compaction_readahead_size_, compaction_readahead_size_, + &prefetch_buffer_, /*implicit_auto_readahead=*/false, + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0); + return; + } + + // Explicit user requested readahead. + if (readahead_size > 0) { + rep->CreateFilePrefetchBufferIfNotExists( + readahead_size, readahead_size, &prefetch_buffer_, + /*implicit_auto_readahead=*/false, /*num_file_reads=*/0, + /*num_file_reads_for_auto_readahead=*/0); + return; + } + + // Implicit readahead. + + // If max_auto_readahead_size is set to be 0 by user, no data will be + // prefetched. + size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; + if (max_auto_readahead_size == 0 || initial_auto_readahead_size_ == 0) { + return; + } + + if (initial_auto_readahead_size_ > max_auto_readahead_size) { + initial_auto_readahead_size_ = max_auto_readahead_size; + } + + // In case of no_sequential_checking, it will skip the num_file_reads_ and + // will always creates the FilePrefetchBuffer. + if (no_sequential_checking) { + rep->CreateFilePrefetchBufferIfNotExists( + initial_auto_readahead_size_, max_auto_readahead_size, + &prefetch_buffer_, /*implicit_auto_readahead=*/true, + /*num_file_reads=*/0, + rep->table_options.num_file_reads_for_auto_readahead); + return; + } + + size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); + size_t offset = handle.offset(); + + // If FS supports prefetching (readahead_limit_ will be non zero in that case) + // and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + UpdateReadPattern(offset, len); + return; + } + + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, len); + ResetValues(rep->table_options.initial_auto_readahead_size); + return; + } + UpdateReadPattern(offset, len); + + // Implicit auto readahead, which will be enabled if the number of reads + // reached `table_options.num_file_reads_for_auto_readahead` (default: 2) and + // scans are sequential. + num_file_reads_++; + if (num_file_reads_ <= rep->table_options.num_file_reads_for_auto_readahead) { + return; + } + + if (rep->file->use_direct_io()) { + rep->CreateFilePrefetchBufferIfNotExists( + initial_auto_readahead_size_, max_auto_readahead_size, + &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, + rep->table_options.num_file_reads_for_auto_readahead); + return; + } + + if (readahead_size_ > max_auto_readahead_size) { + readahead_size_ = max_auto_readahead_size; + } + + // If prefetch is not supported, fall back to use internal prefetch buffer. + // Discarding other return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + Status s = rep->file->Prefetch( + handle.offset(), + BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_, + rate_limiter_priority); + if (s.IsNotSupported()) { + rep->CreateFilePrefetchBufferIfNotExists( + initial_auto_readahead_size_, max_auto_readahead_size, + &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, + rep->table_options.num_file_reads_for_auto_readahead); + return; + } + + readahead_limit_ = offset + len + readahead_size_; + // Keep exponentially increasing readahead size until + // max_auto_readahead_size. 
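+  // For example (illustrative sizes): starting from an 8 KB initial readahead
+  // with a 256 KB cap, successive sequential reads use 8 KB, 16 KB, 32 KB,
+  // ... of readahead until the cap is reached.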
+ readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefetcher.h b/src/rocksdb/table/block_based/block_prefetcher.h new file mode 100644 index 000000000..518868a30 --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefetcher.h @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +class BlockPrefetcher { + public: + explicit BlockPrefetcher(size_t compaction_readahead_size, + size_t initial_auto_readahead_size) + : compaction_readahead_size_(compaction_readahead_size), + readahead_size_(initial_auto_readahead_size), + initial_auto_readahead_size_(initial_auto_readahead_size) {} + + void PrefetchIfNeeded(const BlockBasedTable::Rep* rep, + const BlockHandle& handle, size_t readahead_size, + bool is_for_compaction, + const bool no_sequential_checking, + Env::IOPriority rate_limiter_priority); + FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } + + void UpdateReadPattern(const uint64_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const uint64_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues(size_t initial_auto_readahead_size) { + num_file_reads_ = 1; + // Since initial_auto_readahead_size_ can be different from + // the value passed to BlockBasedTableOptions.initial_auto_readahead_size in + // case of adaptive_readahead, so fallback the readahead_size_ to that value + // in case of reset. + initial_auto_readahead_size_ = initial_auto_readahead_size; + readahead_size_ = initial_auto_readahead_size_; + readahead_limit_ = 0; + return; + } + + void SetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) { + num_file_reads_ = readahead_info->num_file_reads; + initial_auto_readahead_size_ = readahead_info->readahead_size; + TEST_SYNC_POINT_CALLBACK("BlockPrefetcher::SetReadaheadState", + &initial_auto_readahead_size_); + } + + private: + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. + size_t compaction_readahead_size_; + + // readahead_size_ is used if underlying FS supports prefetching. + size_t readahead_size_; + size_t readahead_limit_ = 0; + // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch + // buffer. + uint64_t initial_auto_readahead_size_; + uint64_t num_file_reads_ = 0; + uint64_t prev_offset_ = 0; + size_t prev_len_ = 0; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc new file mode 100644 index 000000000..c83701d69 --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefix_index.cc @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/block_prefix_index.h" + +#include <vector> + +#include "memory/arena.h" +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +inline uint32_t Hash(const Slice& s) { + return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0); +} + +inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { + return Hash(prefix) % num_buckets; +} + +// The prefix block index is simply a bucket array, with each entry pointing to +// the blocks that span the prefixes hashed to this bucket. +// +// To reduce memory footprint, if there is only one block per bucket, the entry +// stores the block id directly. If there are more than one blocks per bucket, +// because of hash collision or a single prefix spanning multiple blocks, +// the entry points to an array of block ids. The block array is an array of +// uint32_t's. The first uint32_t indicates the total number of blocks, followed +// by the block ids. +// +// To differentiate the two cases, the high order bit of the entry indicates +// whether it is a 'pointer' into a separate block array. +// 0x7FFFFFFF is reserved for empty bucket. + +const uint32_t kNoneBlock = 0x7FFFFFFF; +const uint32_t kBlockArrayMask = 0x80000000; + +inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; } + +inline bool IsBlockId(uint32_t block_id) { + return (block_id & kBlockArrayMask) == 0; +} + +inline uint32_t DecodeIndex(uint32_t block_id) { + uint32_t index = block_id ^ kBlockArrayMask; + assert(index < kBlockArrayMask); + return index; +} + +inline uint32_t EncodeIndex(uint32_t index) { + assert(index < kBlockArrayMask); + return index | kBlockArrayMask; +} + +// temporary storage for prefix information during index building +struct PrefixRecord { + Slice prefix; + uint32_t start_block; + uint32_t end_block; + uint32_t num_blocks; + PrefixRecord* next; +}; + +class BlockPrefixIndex::Builder { + public: + void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) { + PrefixRecord* record = reinterpret_cast<PrefixRecord*>( + arena_.AllocateAligned(sizeof(PrefixRecord))); + record->prefix = key_prefix; + record->start_block = start_block; + record->end_block = start_block + num_blocks - 1; + record->num_blocks = num_blocks; + prefixes_.push_back(record); + } + + BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) { + // For now, use roughly 1:1 prefix to bucket ratio. + uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1; + + // Collect prefix records that hash to the same bucket, into a single + // linklist. + std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr); + std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0); + for (PrefixRecord* current : prefixes_) { + uint32_t bucket = PrefixToBucket(current->prefix, num_buckets); + // merge the prefix block span if the first block of this prefix is + // connected to the last block of the previous prefix. 
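+      // e.g. if prev spans blocks [3, 6] and current starts at block 6 or 7,
+      // extend prev to current->end_block instead of chaining a new record,
+      // so the bucket keeps a single contiguous block range.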
+ PrefixRecord* prev = prefixes_per_bucket[bucket]; + if (prev) { + assert(current->start_block >= prev->end_block); + auto distance = current->start_block - prev->end_block; + if (distance <= 1) { + prev->end_block = current->end_block; + prev->num_blocks = prev->end_block - prev->start_block + 1; + num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1); + continue; + } + } + current->next = prev; + prefixes_per_bucket[bucket] = current; + num_blocks_per_bucket[bucket] += current->num_blocks; + } + + // Calculate the block array buffer size + uint32_t total_block_array_entries = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks > 1) { + total_block_array_entries += (num_blocks + 1); + } + } + + // Populate the final prefix block index + uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; + uint32_t* buckets = new uint32_t[num_buckets]; + uint32_t offset = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks == 0) { + assert(prefixes_per_bucket[i] == nullptr); + buckets[i] = kNoneBlock; + } else if (num_blocks == 1) { + assert(prefixes_per_bucket[i] != nullptr); + assert(prefixes_per_bucket[i]->next == nullptr); + buckets[i] = prefixes_per_bucket[i]->start_block; + } else { + assert(total_block_array_entries > 0); + assert(prefixes_per_bucket[i] != nullptr); + buckets[i] = EncodeIndex(offset); + block_array_buffer[offset] = num_blocks; + uint32_t* last_block = &block_array_buffer[offset + num_blocks]; + auto current = prefixes_per_bucket[i]; + // populate block ids from largest to smallest + while (current != nullptr) { + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; + last_block--; + } + current = current->next; + } + assert(last_block == &block_array_buffer[offset]); + offset += (num_blocks + 1); + } + } + + assert(offset == total_block_array_entries); + + return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets, + total_block_array_entries, block_array_buffer); + } + + private: + std::vector<PrefixRecord*> prefixes_; + Arena arena_; +}; + +Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index) { + uint64_t pos = 0; + auto meta_pos = prefix_meta; + Status s; + Builder builder; + + while (!meta_pos.empty()) { + uint32_t prefix_size = 0; + uint32_t entry_index = 0; + uint32_t num_blocks = 0; + if (!GetVarint32(&meta_pos, &prefix_size) || + !GetVarint32(&meta_pos, &entry_index) || + !GetVarint32(&meta_pos, &num_blocks)) { + s = Status::Corruption( + "Corrupted prefix meta block: unable to read from it."); + break; + } + if (pos + prefix_size > prefixes.size()) { + s = Status::Corruption( + "Corrupted prefix meta block: size inconsistency."); + break; + } + Slice prefix(prefixes.data() + pos, prefix_size); + builder.Add(prefix, entry_index, num_blocks); + + pos += prefix_size; + } + + if (s.ok() && pos != prefixes.size()) { + s = Status::Corruption("Corrupted prefix meta block"); + } + + if (s.ok()) { + *prefix_index = builder.Finish(prefix_extractor); + } + + return s; +} + +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { + Slice prefix = internal_prefix_extractor_.Transform(key); + + uint32_t bucket = PrefixToBucket(prefix, num_buckets_); + uint32_t block_id = buckets_[bucket]; + + if (IsNone(block_id)) { + return 0; + } else if 
(IsBlockId(block_id)) { + *blocks = &buckets_[bucket]; + return 1; + } else { + uint32_t index = DecodeIndex(block_id); + assert(index < num_block_array_buffer_entries_); + *blocks = &block_array_buffer_[index + 1]; + uint32_t num_blocks = block_array_buffer_[index]; + assert(num_blocks > 1); + assert(index + num_blocks < num_block_array_buffer_entries_); + return num_blocks; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefix_index.h b/src/rocksdb/table/block_based/block_prefix_index.h new file mode 100644 index 000000000..4db8e2c65 --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefix_index.h @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <stdint.h> + +#include "db/dbformat.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Comparator; +class Iterator; +class Slice; +class SliceTransform; + +// Build a hash-based index to speed up the lookup for "index block". +// BlockHashIndex accepts a key and, if found, returns its restart index within +// that index block. +class BlockPrefixIndex { + public: + // Maps a key to a list of data blocks that could potentially contain + // the key, based on the prefix. + // Returns the total number of relevant blocks, 0 means the key does + // not exist. + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + + size_t ApproximateMemoryUsage() const { + return sizeof(BlockPrefixIndex) + + (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); + } + + // Create hash index by reading from the metadata blocks. + // Note: table reader (caller) is responsible for keeping shared_ptr to + // underlying prefix extractor + // @params prefixes: a sequence of prefixes. + // @params prefix_meta: contains the "metadata" to of the prefixes. + static Status Create(const SliceTransform* hash_key_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index); + + ~BlockPrefixIndex() { + delete[] buckets_; + delete[] block_array_buffer_; + } + + private: + class Builder; + friend Builder; + + BlockPrefixIndex(const SliceTransform* prefix_extractor, uint32_t num_buckets, + uint32_t* buckets, uint32_t num_block_array_buffer_entries, + uint32_t* block_array_buffer) + : internal_prefix_extractor_(prefix_extractor), + num_buckets_(num_buckets), + num_block_array_buffer_entries_(num_block_array_buffer_entries), + buckets_(buckets), + block_array_buffer_(block_array_buffer) {} + + InternalKeySliceTransform internal_prefix_extractor_; + + uint32_t num_buckets_; + uint32_t num_block_array_buffer_entries_; + uint32_t* buckets_; + uint32_t* block_array_buffer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_test.cc b/src/rocksdb/table/block_based/block_test.cc new file mode 100644 index 000000000..83b87fe79 --- /dev/null +++ b/src/rocksdb/table/block_based/block_test.cc @@ -0,0 +1,627 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
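+//
+// Unit tests for BlockBuilder and Block: block construction and iteration,
+// the data-block hash index, read-amplification bitmap accounting, and
+// related functionality.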
+// + +#include "table/block_based/block.h" + +#include <stdio.h> + +#include <algorithm> +#include <set> +#include <string> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +std::string GenerateInternalKey(int primary_key, int secondary_key, + int padding_size, Random *rnd) { + char buf[50]; + char *p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += rnd->RandomString(padding_size); + } + AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue); + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. +void GenerateRandomKVs(std::vector<std::string> *keys, + std::vector<std::string> *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + // `DataBlockIter` assumes it reads only internal keys. + keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(rnd.RandomString(100)); + } + } +} + +class BlockTest : public testing::Test {}; + +// block test +TEST_F(BlockTest, SimpleTest) { + Random rnd(301); + Options options = Options(); + + std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 100000; + + GenerateRandomKVs(&keys, &values, 0, num_records); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + // read contents of block sequentially + int count = 0; + InternalIterator *iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + Slice v = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + ASSERT_EQ(v.ToString().compare(values[count]), 0); + } + delete iter; + + // read block contents randomly + iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + Slice v = iter->value(); + ASSERT_EQ(v.ToString().compare(values[index]), 0); + } + delete iter; +} + +// return the block contents +BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder, + const 
std::vector<std::string> &keys, + const std::vector<std::string> &values, + const int /*prefix_group_size*/ = 1) { + builder->reset(new BlockBuilder(1 /* restart interval */)); + + // Add only half of the keys + for (size_t i = 0; i < keys.size(); ++i) { + (*builder)->Add(keys[i], values[i]); + } + Slice rawblock = (*builder)->Finish(); + + BlockContents contents; + contents.data = rawblock; + + return contents; +} + +void CheckBlockContents(BlockContents contents, const int max_key, + const std::vector<std::string> &keys, + const std::vector<std::string> &values) { + const size_t prefix_size = 6; + // create block reader + BlockContents contents_ref(contents.data); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); + + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(prefix_size)); + + std::unique_ptr<InternalIterator> regular_iter(reader2.NewDataIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber)); + + // Seek existent keys + for (size_t i = 0; i < keys.size(); i++) { + regular_iter->Seek(keys[i]); + ASSERT_OK(regular_iter->status()); + ASSERT_TRUE(regular_iter->Valid()); + + Slice v = regular_iter->value(); + ASSERT_EQ(v.ToString().compare(values[i]), 0); + } + + // Seek non-existent keys. + // For hash index, if no key with a given prefix is not found, iterator will + // simply be set as invalid; whereas the binary search based iterator will + // return the one that is closest. + for (int i = 1; i < max_key - 1; i += 2) { + // `DataBlockIter` assumes its APIs receive only internal keys. + auto key = GenerateInternalKey(i, 0, 0, nullptr); + regular_iter->Seek(key); + ASSERT_TRUE(regular_iter->Valid()); + } +} + +// In this test case, no two key share same prefix. +TEST_F(BlockTest, SimpleIndexHash) { + const int kMaxKey = 100000; + std::vector<std::string> keys; + std::vector<std::string> values; + GenerateRandomKVs(&keys, &values, 0 /* first key id */, + kMaxKey /* last key id */, 2 /* step */, + 8 /* padding size (8 bytes randomly generated suffix) */); + + std::unique_ptr<BlockBuilder> builder; + auto contents = GetBlockContents(&builder, keys, values); + + CheckBlockContents(std::move(contents), kMaxKey, keys, values); +} + +TEST_F(BlockTest, IndexHashWithSharedPrefix) { + const int kMaxKey = 100000; + // for each prefix, there will be 5 keys starts with it. + const int kPrefixGroup = 5; + std::vector<std::string> keys; + std::vector<std::string> values; + // Generate keys with same prefix. + GenerateRandomKVs(&keys, &values, 0, // first key id + kMaxKey, // last key id + 2, // step + 10, // padding size, + kPrefixGroup); + + std::unique_ptr<BlockBuilder> builder; + auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); + + CheckBlockContents(std::move(contents), kMaxKey, keys, values); +} + +// A slow and accurate version of BlockReadAmpBitmap that simply store +// all the marked ranges in a set. +class BlockReadAmpBitmapSlowAndAccurate { + public: + void Mark(size_t start_offset, size_t end_offset) { + assert(end_offset >= start_offset); + marked_ranges_.emplace(end_offset, start_offset); + } + + void ResetCheckSequence() { iter_valid_ = false; } + + // Return true if any byte in this range was Marked + // This does linear search from the previous position. When calling + // multiple times, `offset` needs to be incremental to get correct results. + // Call ResetCheckSequence() to reset it. 
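+  // marked_ranges_ stores (end_offset, start_offset) pairs ordered by end
+  // offset, so lower_bound({offset, 0}) lands on the first range whose end is
+  // >= offset; that range covers `offset` iff offset >= its start as well.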
+ bool IsPinMarked(size_t offset) { + if (iter_valid_) { + // Has existing iterator, try linear search from + // the iterator. + for (int i = 0; i < 64; i++) { + if (offset < iter_->second) { + return false; + } + if (offset <= iter_->first) { + return true; + } + + iter_++; + if (iter_ == marked_ranges_.end()) { + iter_valid_ = false; + return false; + } + } + } + // Initial call or have linear searched too many times. + // Do binary search. + iter_ = marked_ranges_.lower_bound( + std::make_pair(offset, static_cast<size_t>(0))); + if (iter_ == marked_ranges_.end()) { + iter_valid_ = false; + return false; + } + iter_valid_ = true; + return offset <= iter_->first && offset >= iter_->second; + } + + private: + std::set<std::pair<size_t, size_t>> marked_ranges_; + std::set<std::pair<size_t, size_t>>::iterator iter_; + bool iter_valid_ = false; +}; + +TEST_F(BlockTest, BlockReadAmpBitmap) { + uint32_t pin_offset = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) { + pin_offset = *(static_cast<uint32_t *>(arg)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + std::vector<size_t> block_sizes = { + 1, // 1 byte + 32, // 32 bytes + 61, // 61 bytes + 64, // 64 bytes + 512, // 0.5 KB + 1024, // 1 KB + 1024 * 4, // 4 KB + 1024 * 10, // 10 KB + 1024 * 50, // 50 KB + 1024 * 1024 * 4, // 5 MB + 777, + 124653, + }; + const size_t kBytesPerBit = 64; + + Random rnd(301); + for (size_t block_size : block_sizes) { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get()); + BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate; + + size_t needed_bits = (block_size / kBytesPerBit); + if (block_size % kBytesPerBit != 0) { + needed_bits++; + } + + ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size); + + // Generate some random entries + std::vector<size_t> random_entry_offsets; + for (int i = 0; i < 1000; i++) { + random_entry_offsets.push_back(rnd.Next() % block_size); + } + std::sort(random_entry_offsets.begin(), random_entry_offsets.end()); + auto it = + std::unique(random_entry_offsets.begin(), random_entry_offsets.end()); + random_entry_offsets.resize( + std::distance(random_entry_offsets.begin(), it)); + + std::vector<std::pair<size_t, size_t>> random_entries; + for (size_t i = 0; i < random_entry_offsets.size(); i++) { + size_t entry_start = random_entry_offsets[i]; + size_t entry_end; + if (i + 1 < random_entry_offsets.size()) { + entry_end = random_entry_offsets[i + 1] - 1; + } else { + entry_end = block_size - 1; + } + random_entries.emplace_back(entry_start, entry_end); + } + + for (size_t i = 0; i < random_entries.size(); i++) { + read_amp_slow_and_accurate.ResetCheckSequence(); + auto ¤t_entry = random_entries[rnd.Next() % random_entries.size()]; + + read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first), + static_cast<uint32_t>(current_entry.second)); + read_amp_slow_and_accurate.Mark(current_entry.first, + current_entry.second); + + size_t total_bits = 0; + for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) { + total_bits += read_amp_slow_and_accurate.IsPinMarked( + bit_idx * kBytesPerBit + pin_offset); + } + size_t expected_estimate_useful = total_bits * kBytesPerBit; + size_t got_estimate_useful = + stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES); + ASSERT_EQ(expected_estimate_useful, got_estimate_useful); + } + } + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlockTest, BlockWithReadAmpBitmap) { + Random rnd(301); + Options options = Options(); + + std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 10000; + + GenerateRandomKVs(&keys, &values, 0, num_records, 1); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + Slice rawblock = builder.Finish(); + const size_t kBytesPerBit = 8; + + // Read the block sequentially using Next() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kBytesPerBit, stats.get()); + + // read contents of block sequentially + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + + delete iter; + } + + // Read the block sequentially using Seek() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); + for (int i = 0; i < num_records; i++) { + Slice k(keys[i]); + + // search in block for this key + iter->Seek(k); + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + delete iter; + } + + // Read the block randomly + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); + std::unordered_set<int> read_keys; + for (int i = 0; i < num_records; i++) { + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + iter->Seek(k); + iter->value(); + if (read_keys.find(index) == read_keys.end()) { + read_keys.insert(index); + read_bytes += iter->TEST_CurrentEntrySize(); + } + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + 
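+      // Both ratios approximate the fraction of the block that was usefully + // read: read_bytes tracks the exact size of the unique entries touched, + // while the bitmap-based estimate rounds every access up to + // kBytesPerBit-sized chunks, hence the small error tolerance checked below.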
+ double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + // Error in read amplification will be less than 2% if we are reading + // randomly + EXPECT_LT(error_pct, 2); + } + delete iter; + } +} + +TEST_F(BlockTest, ReadAmpBitmapPow2) { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); +} + +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface<std::tuple<bool, bool>> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector<std::string> *separators, + std::vector<BlockHandle> *block_handles, + std::vector<std::string> *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set<std::string> keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. 
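+    // (With 12 random bytes, the last 8 act as the packed sequence number + // and type of an internal key, leaving a 4-byte user key.)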
+ keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + BlockBasedTable::kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector<std::string> separators; + std::vector<BlockHandle> block_handles; + std::vector<std::string> first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? 
first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char **argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_type.h b/src/rocksdb/table/block_based/block_type.h new file mode 100644 index 000000000..a9d6a1a77 --- /dev/null +++ b/src/rocksdb/table/block_based/block_type.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. +// For code sanity, BlockType should imply a specific TBlocklike for +// BlocklikeTraits. +enum class BlockType : uint8_t { + kData, + kFilter, // for second level partitioned filters and full filters + kFilterPartitionIndex, // for top-level index of filter partitions + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/cachable_entry.h b/src/rocksdb/table/block_based/cachable_entry.h new file mode 100644 index 000000000..ad8acb18d --- /dev/null +++ b/src/rocksdb/table/block_based/cachable_entry.h @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> + +#include "port/likely.h" +#include "rocksdb/cache.h" +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. 
+// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). +// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. + +template <class T> +class CachableEntry { + public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value), + cache_(cache), + cache_handle_(cache_handle), + own_value_(own_value) { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) noexcept + : value_(rhs.value_), + cache_(rhs.cache_), + cache_handle_(rhs.cache_handle_), + own_value_(rhs.own_value_) { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) noexcept { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { ReleaseResource(); } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(std::unique_ptr<T>&& value) { + assert(value.get() != nullptr); + + if (UNLIKELY(value_ == value.get() && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value.release(); + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + 
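+    // Adopt the caller's cache handle: the cached object remains owned by + // the cache (own_value_ stays false) and the handle is released later in + // ReleaseResource().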
value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + + void UpdateCachedValue() { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + + value_ = static_cast<T*>(cache_->Value(cache_handle_)); + } + + bool IsReady() { + if (!own_value_) { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + return cache_->IsReady(cache_handle_); + } + return true; + } + + private: + void ReleaseResource() noexcept { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + delete value_; + } + } + + void ResetFields() noexcept { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast<Cache*>(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast<T*>(arg1); + } + + private: + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.cc b/src/rocksdb/table/block_based/data_block_footer.cc new file mode 100644 index 000000000..5d5d8ed55 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
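+// Helpers for packing the data-block index type and the restart count into + // the single 32-bit footer word of a data block: the low 31 bits hold + // num_restarts and the most significant bit is set for the + // kDataBlockBinaryAndHash index type. For example (illustrative), + // num_restarts == 5 packs to 0x80000005 under kDataBlockBinaryAndHash and + // to 0x00000005 under kDataBlockBinarySearch.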
+ +#include "table/block_based/data_block_footer.h" + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.h b/src/rocksdb/table/block_based/data_block_footer.h new file mode 100644 index 000000000..c1cfd4730 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts); + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_hash_index.cc b/src/rocksdb/table/block_based/data_block_hash_index.cc new file mode 100644 index 000000000..c579dcc43 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_hash_index.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/data_block_hash_index.h" + +#include <string> +#include <vector> + +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +void DataBlockHashIndexBuilder::Add(const Slice& key, + const size_t restart_index) { + assert(Valid()); + if (restart_index > kMaxRestartSupportedByHashIndex) { + valid_ = false; + return; + } + + uint32_t hash_value = GetSliceHash(key); + hash_and_restart_pairs_.emplace_back(hash_value, + static_cast<uint8_t>(restart_index)); + estimated_num_buckets_ += bucket_per_key_; +} + +void DataBlockHashIndexBuilder::Finish(std::string& buffer) { + assert(Valid()); + uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_); + + if (num_buckets == 0) { + num_buckets = 1; // sanity check + } + + // The build-in hash cannot well distribute strings when into different + // buckets when num_buckets is power of two, resulting in high hash + // collision. + // We made the num_buckets to be odd to avoid this issue. + num_buckets |= 1; + + std::vector<uint8_t> buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast<const char*>(reinterpret_cast<char*>(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_hash_index.h b/src/rocksdb/table/block_based/data_block_hash_index.h new file mode 100644 index 000000000..321522175 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_hash_index.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. 
It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). +// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. +// NUM_BUCK: Number of buckets, which is the length of the bucket array. +// +// We reserve two special flag: +// kNoEntry=255, +// kCollision=254. +// +// Therefore, the max number of restarts this hash index can supoport is 253. +// +// Buckets are initialized to be kNoEntry. +// +// When storing a key in the hash index, the key is first hashed to a bucket. +// If there the bucket is empty (kNoEntry), the restart index is stored in +// the bucket. If there is already a restart index there, we will update the +// existing restart index to a collision marker (kCollision). If the +// the bucket is already marked as collision, we do not store the restart +// index either. +// +// During query process, a key is first hashed to a bucket. Then we examine if +// the buckets store nothing (kNoEntry) or the bucket had a collision +// (kCollision). If either of those happens, we get the restart index of +// the key and will directly go to the restart interval to search the key. +// +// Note that we only support blocks with #restart_interval < 254. If a block +// has more restart interval than that, hash index will not be create for it. + +const uint8_t kNoEntry = 255; +const uint8_t kCollision = 254; +const uint8_t kMaxRestartSupportedByHashIndex = 253; + +// Because we use uint16_t address, we only support block no more than 64KB +const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16; +const double kDefaultUtilRatio = 0.75; + +class DataBlockHashIndexBuilder { + public: + DataBlockHashIndexBuilder() + : bucket_per_key_(-1 /*uninitialized marker*/), + estimated_num_buckets_(0), + valid_(false) {} + + void Initialize(double util_ratio) { + if (util_ratio <= 0) { + util_ratio = kDefaultUtilRatio; // sanity check + } + bucket_per_key_ = 1 / util_ratio; + valid_ = true; + } + + inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } + void Add(const Slice& key, const size_t restart_index); + void Finish(std::string& buffer); + void Reset(); + inline size_t EstimateSize() const { + uint16_t estimated_num_buckets = + static_cast<uint16_t>(estimated_num_buckets_); + + // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. 
+ estimated_num_buckets |= 1; + + return sizeof(uint16_t) + + static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t)); + } + + private: + double bucket_per_key_; // is the multiplicative inverse of util_ratio_ + double estimated_num_buckets_; + + // Now the only usage for `valid_` is to mark false when the inserted + // restart_index is larger than supported. In this case HashIndex is not + // appended to the block content. + bool valid_; + + std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_; + friend class DataBlockHashIndex_DataBlockHashTestSmall_Test; +}; + +class DataBlockHashIndex { + public: + DataBlockHashIndex() : num_buckets_(0) {} + + void Initialize(const char* data, uint16_t size, uint16_t* map_offset); + + uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const; + + inline bool Valid() { return num_buckets_ != 0; } + + private: + // To make the serialized hash index compact and to save the space overhead, + // here all the data fields persisted in the block are in uint16 format. + // We find that a uint16 is large enough to index every offset of a 64KiB + // block. + // So in other words, DataBlockHashIndex does not support block size equal + // or greater then 64KiB. + uint16_t num_buckets_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_hash_index_test.cc b/src/rocksdb/table/block_based/data_block_hash_index_test.cc new file mode 100644 index 000000000..cd2e30833 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_hash_index_test.cc @@ -0,0 +1,717 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/data_block_hash_index.h" + +#include <cstdlib> +#include <string> +#include <unordered_map> + +#include "db/table_properties_collector.h" +#include "rocksdb/slice.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/get_context.h" +#include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +bool SearchForOffset(DataBlockHashIndex& index, const char* data, + uint16_t map_offset, const Slice& key, + uint8_t& restart_point) { + uint8_t entry = index.Lookup(data, map_offset, key); + if (entry == kCollision) { + return true; + } + + if (entry == kNoEntry) { + return false; + } + + return entry == restart_point; +} + +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random* rnd) { + char buf[50]; + char* p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += rnd->RandomString(padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. 
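+// For example (illustrative), from == 1, len == 4, step == 2 and + // keys_share_prefix == 2 produce keys for the (primary, secondary) pairs + // (1, 0), (1, 1), (3, 0) and (3, 1), each formatted as "%6d%4d" plus the + // optional random padding and paired with a 100-byte random value.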
+void GenerateRandomKVs(std::vector<std::string>* keys, + std::vector<std::string>* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(rnd.RandomString(100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid + for (int i = 0; i <= 253; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + // #restarts > 253. HashIndex is not used + for (int i = 0; i <= 254; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockSizeExceedMax) { + Options options = Options(); + std::string ukey(10, 'k'); + InternalKey ikey(ukey, 0, kTypeValue); + + BlockBuilder builder(1 /* block_restart_interval */, + false /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + { + // insert a large value. The block size plus HashIndex is 65536. + std::string value(65502, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + { + // insert a large value. The block size plus HashIndex would be 65537. + // This excceed the max block size supported by HashIndex (65536). + // So when build finishes HashIndex will not be created for the block. + std::string value(65503, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + // the index type have fallen back to binary when build finish. 
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector<std::string> keys; + std::vector<std::string> values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
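+  // (The marker byte guarantees that seeks with the "0" suffix never match + // an inserted key exactly, although the hash index may still report a + // false positive, which the checks further below account for.)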
+ for (int i = 0; i < num_records; i++) { + std::string ukey(keys[i] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents)); + const InternalKeyComparator icmp(BytewiseComparator()); + + // random seek existent keys + for (int i = 0; i < num_records; i++) { + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(values[index], iter->value()); + + delete iter; + } + + // random seek non-existent user keys + // In this case A), the user_key cannot be found in HashIndex. The key may + // exist in the next block. So the iter is set invalidated to tell the + // caller to search the next block. This test case belongs to this case A). + // + // Note that for non-existent keys, there is possibility of false positive, + // i.e. the key is still hashed into some restart interval. + // Two additional possible outcome: + // B) linear seek the restart interval and not found, the iter stops at the + // starting of the next restart interval. The key does not exist + // anywhere. + // C) linear seek the restart interval and not found, the iter stops at the + // the end of the block, i.e. restarts_. The key may exist in the next + // block. 
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<RandomAccessFileReader> file_reader; + std::unique_ptr<TableReader> table_reader; + int level_ = -1; + + std::vector<std::string> keys; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<FSWritableFile> f(sink); + file_writer.reset( + new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); + std::unique_ptr<TableBuilder> builder; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions( + ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, options.compression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, level_), + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + ASSERT_OK(file_writer->Flush()); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(sink->contents().size(), builder->FileSize()); + + // Open the table + test::StringSource* source = new test::StringSource( + sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads); + std::unique_ptr<FSRandomAccessFile> file(source); + file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); + const bool kSkipFilters = true; + const bool kImmortal = true; + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), sink->contents().size(), &table_reader)); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // insert two large k/v pair. 
Given that the block_size is 4096, one k/v + // pair will take up one block. + // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/filter_block.h b/src/rocksdb/table/block_based/filter_block.h new file mode 100644 index 000000000..e1e206990 --- /dev/null +++ 
b/src/rocksdb/table/block_based/filter_block.h @@ -0,0 +1,182 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/format.h" +#include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +const uint64_t kNotValid = ULLONG_MAX; +class FilterPolicy; + +class GetContext; +using MultiGetRange = MultiGetContext::Range; + +// A FilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table, or partitioned into smaller filters. +// +// The sequence of calls to FilterBlockBuilder must match the regexp: +// Add* Finish +class FilterBlockBuilder { + public: + explicit FilterBlockBuilder() {} + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&) = delete; + void operator=(const FilterBlockBuilder&) = delete; + + virtual ~FilterBlockBuilder() {} + + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; + Slice Finish() { // Generate Filter + const BlockHandle empty_handle; + Status dont_care_status; + auto ret = Finish(empty_handle, &dont_care_status); + assert(dont_care_status.ok()); + return ret; + } + // If filter_data is not nullptr, Finish() may transfer ownership of + // underlying filter data to the caller, so that it can be freed as soon as + // possible. BlockBasedFilterBlock will ignore this parameter. + // + virtual Slice Finish( + const BlockHandle& tmp /* only used in PartitionedFilterBlock as + last_partition_block_handle */ + , + Status* status, std::unique_ptr<const char[]>* filter_data = nullptr) = 0; + + // This is called when finishes using the FilterBitsBuilder + // in order to release memory usage and cache charge + // associated with it timely + virtual void ResetFilterBitsBuilder() {} + + // To optionally post-verify the filter returned from + // FilterBlockBuilder::Finish. + // Return Status::OK() if skipped. + virtual Status MaybePostVerifyFilter(const Slice& /* filter_content */) { + return Status::OK(); + } +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +// +// BlockBased/Full FilterBlock would be called in the same way. 
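+// A typical lookup (illustrative sketch) consults the filter before touching + // any data block, e.g.: + //   if (filter->KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context, + //                           lookup_context, Env::IO_TOTAL)) { + //     ... search the data blocks ... + //   } + // and skips the block lookup entirely when the filter rules the key out.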
+class FilterBlockReader { + public: + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; + + /** + * If no_io is set, then it returns true if it cannot answer the query without + * reading data from disk. This is used in PartitionedFilterBlockReader to + * avoid reading partitions that are not in block cache already + * + * Normally filters are built on only the user keys and the InternalKey is not + * needed for a query. The index in PartitionedFilterBlockReader however is + * built upon InternalKey and must be provided via const_ikey_ptr when running + * queries. + */ + virtual bool KeyMayMatch(const Slice& key, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) = 0; + + virtual void KeysMayMatch(MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey_without_ts = iter->ukey_without_ts; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context, + lookup_context, rate_limiter_priority)) { + range->SkipKey(iter); + } + } + } + + /** + * no_io and const_ikey_ptr here means the same as in KeyMayMatch + */ + virtual bool PrefixMayMatch(const Slice& prefix, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) = 0; + + virtual void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey_without_ts = iter->ukey_without_ts; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), no_io, + &ikey, get_context, lookup_context, + rate_limiter_priority)) { + range->SkipKey(iter); + } + } + } + + virtual size_t ApproximateMemoryUsage() const = 0; + + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + + virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + return Status::OK(); + } + + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key_without_ts, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check, + bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.cc b/src/rocksdb/table/block_based/filter_block_reader_common.cc new file mode 100644 index 000000000..7dc49e83e --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" + +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block, BlockType block_type) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + block_type, get_context, lookup_context, + /* for_compaction */ false, use_cache, + /* wait_for_cache */ true, /* async_read */ false); + + return s; +} + +template <typename TBlocklike> +const SliceTransform* +FilterBlockReaderCommon<TBlocklike>::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block, BlockType block_type, + Env::IOPriority rate_limiter_priority) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + read_options.rate_limiter_priority = rate_limiter_priority; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + cache_filter_blocks(), get_context, lookup_context, + filter_block, block_type); +} + +template <typename TBlocklike> +size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? 
filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key_without_ts); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, no_io, const_ikey_ptr, + /* get_context */ nullptr, lookup_context, + rate_limiter_priority); + } +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const { + // Try to reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon<Block>; +template class FilterBlockReaderCommon<ParsedFullFilterBlock>; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.h b/src/rocksdb/table/block_based/filter_block_reader_common.h new file mode 100644 index 000000000..ca07f5050 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.h @@ -0,0 +1,79 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#pragma once + +#include <cassert> + +#include "block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template <typename TBlocklike> +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry<TBlocklike>&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { + full_length_enabled_ = + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); + } + } + + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block, + BlockType block_type); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block, + BlockType block_type, + Env::IOPriority rate_limiter_priority) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + const BlockBasedTable* table_; + CachableEntry<TBlocklike> filter_block_; + size_t prefix_extractor_full_length_ = 0; + bool full_length_enabled_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy.cc b/src/rocksdb/table/block_based/filter_policy.cc new file mode 100644 index 000000000..f84f804dd --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy.cc @@ -0,0 +1,1973 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
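The prefix compatibility rule in RangeMayExist / IsFilterCompatible of filter_block_reader_common.cc above is easier to see with concrete values. The sketch below is not RocksDB code; it uses plain std::string and a hypothetical fixed 3-byte prefix to mirror the two cases in which the prefix filter may still be consulted: either the whole range [user_key, upper_bound) shares one prefix, or upper_bound is exactly the same-length immediate successor of that prefix.

#include <cstddef>
#include <string>

// Sketch only: a fixed 3-byte "prefix extractor" with byte-wise comparison.
bool PrefixFilterUsableSketch(const std::string& user_key,
                              const std::string& upper_bound) {
  constexpr std::size_t kPrefixLen = 3;
  if (user_key.size() < kPrefixLen || upper_bound.size() < kPrefixLen) {
    return false;  // outside the extractor's domain: cannot use the filter
  }
  const std::string prefix = user_key.substr(0, kPrefixLen);
  if (upper_bound.compare(0, kPrefixLen, prefix) == 0) {
    return true;  // the whole range shares the prefix
  }
  // Same-length immediate successor, e.g. "abc" -> "abd" (the 0xff carry
  // case is ignored here for brevity).
  std::string successor = prefix;
  successor.back() = static_cast<char>(successor.back() + 1);
  return upper_bound == successor;
}

// PrefixFilterUsableSketch("abc1", "abc9") == true   (same prefix "abc")
// PrefixFilterUsableSketch("abc1", "abd")  == true   ("abd" succeeds "abc")
// PrefixFilterUsableSketch("abc1", "abz")  == false  (range spans prefixes)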
+ +#include "rocksdb/filter_policy.h" + +#include <array> +#include <climits> +#include <cstring> +#include <deque> +#include <limits> +#include <memory> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" +#include "port/lang.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/math.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +Slice FinishAlwaysTrue(std::unique_ptr<const char[]>* /*buf*/) { + return Slice("\0\0\0\0\0\0", 6); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. +class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic<int64_t>* aggregate_rounding_balance, + std::shared_ptr<CacheReservationManager> cache_res_mgr, + bool detect_filter_construct_corruption) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr), + detect_filter_construct_corruption_( + detect_filter_construct_corruption) {} + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. 
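+ // For example, if the prefixes "app", "app", "apple" are added in that
+ // order, only two hash entries are stored: the adjacent duplicate "app"
+ // is collapsed, while a non-adjacent repeat would be stored again.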
+ if (hash_entries_info_.entries.empty() || + hash != hash_entries_info_.entries.back()) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); + Status s = cache_res_mgr_->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entries_info_.cache_res_bucket_handles.back()); + s.PermitUncheckedError(); + } + } + } + + virtual size_t EstimateEntriesAdded() override { + return hash_entries_info_.entries.size(); + } + + virtual Status MaybePostVerify(const Slice& filter_content) override; + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache charging is available + static const std::size_t kUint64tHashEntryCacheResBucketSize = + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(uint64_t); + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + assert(other != nullptr); + hash_entries_info_.Swap(&(other->hash_entries_info_)); + } + + void ResetEntries() { hash_entries_info_.Reset(); } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr<char[]>* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + if (aggregate_rounding_balance_ != nullptr) { + // Do optimize_filters_for_memory, using malloc_usable_size. + // Approach: try to keep FP rate balance better than or on + // target (negative aggregate_rounding_balance_). We can then select a + // lower bound filter size (within reasonable limits) that gets us as + // close to on target as possible. We request allocation for that filter + // size and use malloc_usable_size to "round up" to the actual + // allocation size. + + // Although it can be considered bad practice to use malloc_usable_size + // to access an object beyond its original size, this approach should be + // quite general: working for all allocators that properly support + // malloc_usable_size. + + // Race condition on balance is OK because it can only cause temporary + // skew in rounding up vs. rounding down, as long as updates are atomic + // and relative. + int64_t balance = aggregate_rounding_balance_->load(); + + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); + double rv_fp_rate = target_fp_rate; + + if (balance < 0) { + // See formula for BloomFilterPolicy::aggregate_rounding_balance_ + double for_balance_fp_rate = + -balance / double{0x100000000} + target_fp_rate; + + // To simplify, we just try a few modified smaller sizes. This also + // caps how much we vary filter size vs. target, to avoid outlier + // behavior from excessive variance. 
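+ // For example (illustrative numbers): with target_len_with_metadata of
+ // 1029 bytes (1024 + 5 bytes metadata), the candidates tried below are
+ // roughly 768, 832, 896 and 960 bytes of filter data (3/4, 13/16, 7/8 and
+ // 15/16 of 1024), each rounded to the implementation's usable size first.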
+ size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : + {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, + uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); + if (maybe_fp_rate <= for_balance_fp_rate) { + rv = maybe_len_with_metadata; + rv_fp_rate = maybe_fp_rate; + break; + } + } + } + + // Filter blocks are loaded into block cache with their block trailer. + // We need to make sure that's accounted for in choosing a + // fragmentation-friendly size. + const size_t kExtraPadding = BlockBasedTable::kBlockTrailerSize; + size_t requested = rv + kExtraPadding; + + // Allocate and get usable size + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); + + if (usable - usable / 4 > requested) { + // Ratio greater than 4/3 is too much for utilizing, if it's + // not a buggy or mislinked malloc_usable_size implementation. + // Non-linearity of FP rates with bits/key means rapidly + // diminishing returns in overall accuracy for additional + // storage on disk. + // Nothing to do, except assert that the result is accurate about + // the usable size. (Assignment never used.) + assert(((*buf)[usable - 1] = 'x')); + } else if (usable > requested) { + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); + } else { + // Too small means bad malloc_usable_size + assert(usable == requested); + } + memset(buf->get(), 0, rv); + + // Update balance + int64_t diff = static_cast<int64_t>((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); + } +#else + (void)num_entries; + buf->reset(new char[rv]()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return rv; + } + + // TODO: Ideally we want to verify the hash entry + // as it is added to the filter and eliminate this function + // for speeding up and leaving fewer spaces for undetected memory/CPU + // corruption. For Ribbon Filter, it's bit harder. + // Possible solution: + // pass a custom iterator that tracks the xor checksum as + // it iterates to ResetAndFindSeedToSolve + Status MaybeVerifyHashEntriesChecksum() { + if (!detect_filter_construct_corruption_) { + return Status::OK(); + } + + uint64_t actual_hash_entries_xor_checksum = 0; + for (uint64_t h : hash_entries_info_.entries) { + actual_hash_entries_xor_checksum ^= h; + } + + if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { + return Status::OK(); + } else { + // Since these hash entries are corrupted and they will not be used + // anymore, we can reset them and release memory. + ResetEntries(); + return Status::Corruption("Filter's hash entries checksum mismatched"); + } + } + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. 
+ std::atomic<int64_t>* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr<CacheReservationManager> cache_res_mgr_; + + // For managing cache charge for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque<std::unique_ptr<CacheReservationManager::CacheReservationHandle>> + final_filter_cache_res_handles_; + + bool detect_filter_construct_corruption_; + + struct HashEntriesInfo { + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque<uint64_t> entries; + + // If cache_res_mgr_ != nullptr, + // it manages cache charge for buckets of hash entries in (new) Bloom + // or Ribbon Filter construction. + // Otherwise, it is empty. + std::deque<std::unique_ptr<CacheReservationManager::CacheReservationHandle>> + cache_res_bucket_handles; + + // If detect_filter_construct_corruption_ == true, + // it records the xor checksum of hash entries. + // Otherwise, it is 0. + uint64_t xor_checksum = 0; + + void Swap(HashEntriesInfo* other) { + assert(other != nullptr); + std::swap(entries, other->entries); + std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); + std::swap(xor_checksum, other->xor_checksum); + } + + void Reset() { + entries.clear(); + cache_res_bucket_handles.clear(); + xor_checksum = 0; + } + }; + + HashEntriesInfo hash_entries_info_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic<int64_t>* aggregate_rounding_balance, + std::shared_ptr<CacheReservationManager> cache_res_mgr, + bool detect_filter_construct_corruption) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), + millibits_per_key_(millibits_per_key) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + using FilterBitsBuilder::Finish; + + virtual Slice Finish(std::unique_ptr<const char[]>* buf) override { + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr<const char[]>* buf, + Status* status) override { + size_t num_entries = hash_entries_info_.entries.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr<char[]> mutable_buf; + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache charging for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); + + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); + + uint32_t len = static_cast<uint32_t>(len_with_metadata - 
kMetadataLen); + if (len > 0) { + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + AddAllEntries(mutable_buf.get(), len, num_probes); + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + mutable_buf[len] = static_cast<char>(-1); + // 0 = Marker for this sub-implementation + mutable_buf[len + 1] = static_cast<char>(0); + // num_probes (and 0 in upper bits for 64-byte block size) + mutable_buf[len + 2] = static_cast<char>(num_probes); + // rest of metadata stays zero + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; + } + + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast<size_t>(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast<size_t>( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } + + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; + } + + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { + int num_probes = GetNumProbes(keys, len_with_metadata); + return FastLocalBloomImpl::EstimatedFpRate( + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; + } + + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; + } + + private: + // Compute num_probes after any rounding / adjustments + int GetNumProbes(size_t keys, size_t len_with_metadata) { + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; + int actual_millibits_per_key = + static_cast<int>(millibits / std::max(keys, size_t{1})); + // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to + // minimize unit test churn. Remove this some time. 
+ if (!aggregate_rounding_balance_) { + actual_millibits_per_key = millibits_per_key_; + } + // END XXX/TODO + return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key); + } + + void AddAllEntries(char* data, uint32_t len, int num_probes) { + // Simple version without prefetching: + // + // for (auto h : hash_entries_info_.entries) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes, data); + // } + + const size_t num_entries = hash_entries_info_.entries.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array<uint32_t, kBufferMask + 1> hashes; + std::array<uint32_t, kBufferMask + 1> byte_offsets; + + // Prime the buffer + size_t i = 0; + std::deque<uint64_t>::iterator hash_entries_it = + hash_entries_info_.entries.begin(); + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = *hash_entries_it; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + ++hash_entries_it; + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes, + data + byte_offset_ref); + // And buffer + uint64_t h = *hash_entries_it; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + ++hash_entries_it; + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes, + data + byte_offsets[i]); + } + } + + // Target allocation per added key, in thousandths of a bit. 
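+ // (e.g. a configured 9.5 bits per key is stored here as 9500)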
+ int millibits_per_key_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public BuiltinFilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + bool HashMayMatch(const uint64_t h) override { + return FastLocalBloomImpl::HashMayMatch(Lower32of64(h), Upper32of64(h), + len_bytes_, num_probes_, data_); + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. Any change almost certainly changes + // underlying data. + static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. 
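+ // (Index bounds the number of Ribbon slots; the bit width of ResultRow
+ // bounds the number of solution columns, i.e. fingerprint bits per key.)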
+ using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>; + +class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic<int64_t>* aggregate_rounding_balance, + std::shared_ptr<CacheReservationManager> cache_res_mgr, + bool detect_filter_construct_corruption, Logger* info_log) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr, + detect_filter_construct_corruption), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance, + cache_res_mgr, detect_filter_construct_corruption) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + using FilterBitsBuilder::Finish; + + virtual Slice Finish(std::unique_ptr<const char[]>* buf) override { + return Finish(buf, nullptr); + } + + virtual Slice Finish(std::unique_ptr<const char[]>* buf, + Status* status) override { + if (hash_entries_info_.entries.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN( + info_log_, "Too many keys for Ribbon filter: %llu", + static_cast<unsigned long long>(hash_entries_info_.entries.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + if (hash_entries_info_.entries.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. 
+ if (status) { + *status = Status::OK(); + } + return FinishAlwaysFalse(buf); + } + uint32_t num_entries = + static_cast<uint32_t>(hash_entries_info_.entries.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + + uint32_t entropy = 0; + if (!hash_entries_info_.entries.empty()) { + entropy = Lower32of64(hash_entries_info_.entries.front()); + } + + BandingType banding; + std::size_t bytes_banding = ribbon::StandardBanding< + Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots); + Status status_banding_cache_res = Status::OK(); + + // Cache charging for banding + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + banding_res_handle; + if (cache_res_mgr_) { + status_banding_cache_res = cache_res_mgr_->MakeCacheReservation( + bytes_banding, &banding_res_handle); + } + + if (status_banding_cache_res.IsMemoryLimit()) { + ROCKS_LOG_WARN(info_log_, + "Cache charging for Ribbon filter banding failed due " + "to cache full"); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + // Release cache for banding since the banding won't be allocated + banding_res_handle.reset(); + return bloom_fallback_.Finish(buf, status); + } + + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_info_.entries.begin(), + hash_entries_info_.entries.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN( + info_log_, "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast<unsigned long long>(hash_entries_info_.entries.size()), + static_cast<unsigned long long>(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_info_.entries.empty()); + return bloom_fallback_.Finish(buf, status); + } + + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + ROCKS_LOG_WARN(info_log_, "Verify hash entries checksum error: %s", + verify_hash_entries_checksum_status.getState()); + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr<char[]> mutable_buf; + std::unique_ptr<CacheReservationManager::CacheReservationHandle> + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache charging for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See 
BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast<char>(-2); + // Hash seed + mutable_buf[len_with_metadata - 4] = static_cast<char>(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast<char>(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast<char>((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast<char>((num_blocks >> 16) & 255); + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. + void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_info_.entries.empty()) { + entropy = Upper32of64(hash_entries_info_.entries.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast<uint32_t>(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. + // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits<uint32_t>::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. 
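+ // For example, with desired_one_in_fp_rate_ = 100 (a 1% FP rate):
+ // upper_bits_per_key = 1 + FloorLog2(100) = 7, fp_rate_for_upper = 2^-7,
+ // portion_lower = (1/100 - 1/128) / (1/128) = 0.28, so
+ // min_real_bits_per_slot = 7 - 0.28 = 6.72 bits.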
+ uint32_t rounded = static_cast<uint32_t>(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. + double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast<uint32_t>(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast<uint32_t>(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + Status MaybePostVerify(const Slice& filter_content) override { + bool fall_back = (bloom_fallback_.EstimateEntriesAdded() > 0); + return fall_back ? 
bloom_fallback_.MaybePostVerify(filter_content) + : XXPH3FilterBitsBuilder::MaybePostVerify(filter_content); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution<TS>; + using BandingType = ribbon::StandardBanding<TS>; + using ConfigHelper = ribbon::BandingConfigHelper1TS<ribbon::kOneIn20, TS>; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. This is + // within an order of magnitude of implementation limit on num_slots + // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits + // (for filter metadata). + static constexpr uint32_t kMaxRibbonEntries = 950000000; // ~ 1 billion + + // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + + // For warnings, or can be nullptr + Logger* info_log_; + + // For falling back on Bloom filter in some exceptional cases and + // very small filter cases + FastLocalBloomBitsBuilder bloom_fallback_; +}; + +// for the linker, at least with DEBUG_LEVEL=2 +constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; + +class Standard128RibbonBitsReader : public BuiltinFilterBitsReader { + public: + Standard128RibbonBitsReader(const char* data, size_t len_bytes, + uint32_t num_blocks, uint32_t seed) + : soln_(const_cast<char*>(data), len_bytes) { + soln_.ConfigureForNumBlocks(num_blocks); + hasher_.SetOrdinalSeed(seed); + } + + // No Copy allowed + Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; + void operator=(const Standard128RibbonBitsReader&) = delete; + + ~Standard128RibbonBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + return soln_.FilterQuery(h, hasher_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + struct SavedData { + uint64_t seeded_hash; + uint32_t segment_num; + uint32_t num_columns; + uint32_t start_bits; + }; + std::array<SavedData, MultiGetContext::MAX_BATCH_SIZE> saved; + for (int i = 0; i < num_keys; ++i) { + ribbon::InterleavedPrepareQuery( + GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash, + &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = ribbon::InterleavedFilterQuery( + saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns, + saved[i].start_bits, hasher_, soln_); + } + } + + bool HashMayMatch(const uint64_t h) override { + return soln_.FilterQuery(h, hasher_); + } + + private: + using TS = Standard128RibbonTypesAndSettings; + ribbon::SerializableInterleavedSolution<TS> soln_; + ribbon::StandardHasher<TS> hasher_; +}; + +// ##################### Legacy Bloom implementation ################### // + +using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>; + +class LegacyBloomBitsBuilder : public 
BuiltinFilterBitsBuilder { + public: + explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); + + // No Copy allowed + LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; + void operator=(const LegacyBloomBitsBuilder&) = delete; + + ~LegacyBloomBitsBuilder() override; + + void AddKey(const Slice& key) override; + + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } + + using FilterBitsBuilder::Finish; + + Slice Finish(std::unique_ptr<const char[]>* buf) override; + + size_t CalculateSpace(size_t num_entries) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entries, &dont_care1, &dont_care2); + } + + double EstimatedFpRate(size_t keys, size_t bytes) override { + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen, + num_probes_); + } + + size_t ApproximateNumEntries(size_t bytes) override; + + private: + int bits_per_key_; + int num_probes_; + std::vector<uint32_t> hash_entries_; + Logger* info_log_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(size_t num_entries, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, + Logger* info_log) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), + info_log_(info_log) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) { + uint32_t total_bits, num_lines; + size_t num_entries = hash_entries_.size(); + char* data = + ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + + // Check for excessive entries for 32-bit hash function + if (num_entries >= /* minimum of 3 million */ 3000000U) { + // More specifically, we can detect that the 32-bit hash function + // is causing significant increase in FP rate by comparing current + // estimated FP rate to what we would get with a normal number of + // keys at same memory ratio. + double est_fp_rate = LegacyBloomImpl::EstimatedFpRate( + num_entries, total_bits / 8, num_probes_); + double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate( + 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_); + + if (est_fp_rate >= 1.50 * vs_fp_rate) { + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + info_log_, + "Using legacy SST/BBT Bloom filter with excessive key count " + "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. 
" + "Consider using new Bloom with format_version>=5, smaller SST " + "file size, or partitioned filters.", + num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate); + } + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast<char>(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast<uint32_t>(total_bits_tmp) / + static_cast<uint32_t>(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. + if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entries != 0) { + size_t total_bits_tmp = num_entries * bits_per_key_; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000}); + + *total_bits = + GetTotalBitsForLocality(static_cast<uint32_t>(total_bits_tmp)); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += kMetadataLen; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, + uint32_t total_bits) { +#ifdef NDEBUG + static_cast<void>(total_bits); +#endif + assert(num_lines > 0 && total_bits > 0); + + LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, + ConstexprFloorLog2(CACHE_LINE_SIZE)); +} + +class LegacyBloomBitsReader : public BuiltinFilterBitsReader { + public: + LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, + uint32_t log2_cache_line_size) + : data_(data), + num_probes_(num_probes), + num_lines_(num_lines), + log2_cache_line_size_(log2_cache_line_size) {} + + // No Copy allowed + LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; + void operator=(const LegacyBloomBitsReader&) = delete; + + ~LegacyBloomBitsReader() override {} + + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. 
This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. + bool MayMatch(const Slice& key) override { + uint32_t hash = BloomHash(key); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(*keys[i]); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + + bool HashMayMatch(const uint64_t /* h */) override { return false; } + + private: + const char* data_; + const int num_probes_; + const uint32_t num_lines_; + const uint32_t log2_cache_line_size_; +}; + +class AlwaysTrueFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return true; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return false; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { + Status s = Status::OK(); + + if (!detect_filter_construct_corruption_) { + return s; + } + + std::unique_ptr<BuiltinFilterBitsReader> bits_reader( + BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content)); + + for (uint64_t h : hash_entries_info_.entries) { + // The current approach will not detect corruption from XXPH3Filter to + // AlwaysTrueFilter, which can lead to performance cost later due to + // AlwaysTrueFilter not filtering anything. But this cost is acceptable + // given the extra implementation complixity to detect such case. 
+ bool may_match = bits_reader->HashMayMatch(h); + if (!may_match) { + s = Status::Corruption("Corrupted filter content"); + break; + } + } + + ResetEntries(); + return s; +} +} // namespace + +const char* BuiltinFilterPolicy::kClassName() { + return "rocksdb.internal.BuiltinFilter"; +} + +bool BuiltinFilterPolicy::IsInstanceOf(const std::string& name) const { + if (name == kClassName()) { + return true; + } else { + return FilterPolicy::IsInstanceOf(name); + } +} + +static const char* kBuiltinFilterMetadataName = "rocksdb.BuiltinBloomFilter"; + +const char* BuiltinFilterPolicy::kCompatibilityName() { + return kBuiltinFilterMetadataName; +} + +const char* BuiltinFilterPolicy::CompatibilityName() const { + return kBuiltinFilterMetadataName; +} + +BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key) + : warned_(false), aggregate_rounding_balance_(0) { + // Sanitize bits_per_key + if (bits_per_key < 0.5) { + // Round down to no filter + bits_per_key = 0; + } else if (bits_per_key < 1.0) { + // Minimum 1 bit per key (equiv) when creating filter + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001); + + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. 
+ whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomLikeFilterPolicy::~BloomLikeFilterPolicy() {} +const char* BloomLikeFilterPolicy::kClassName() { + return "rocksdb.internal.BloomLikeFilter"; +} + +bool BloomLikeFilterPolicy::IsInstanceOf(const std::string& name) const { + if (name == kClassName()) { + return true; + } else { + return BuiltinFilterPolicy::IsInstanceOf(name); + } +} + +const char* ReadOnlyBuiltinFilterPolicy::kClassName() { + return kBuiltinFilterMetadataName; +} + +std::string BloomLikeFilterPolicy::GetId() const { + return Name() + GetBitsPerKeySuffix(); +} + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } else if (context.table_options.format_version < 5) { + return GetLegacyBloomBuilderWithContext(context); + } else { + return GetFastLocalBloomBuilderWithContext(context); + } +} + +const char* BloomFilterPolicy::kClassName() { return "bloomfilter"; } +const char* BloomFilterPolicy::kNickName() { return "rocksdb.BloomFilter"; } + +std::string BloomFilterPolicy::GetId() const { + // Including ":false" for better forward-compatibility with 6.29 and earlier + // which required a boolean `use_block_based_builder` parameter + return BloomLikeFilterPolicy::GetId() + ":false"; +} + +FilterBitsBuilder* BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext( + const FilterBuildingContext& context) const { + bool offm = context.table_options.optimize_filters_for_memory; + const auto options_overrides_iter = + context.table_options.cache_usage_options.options_overrides.find( + CacheEntryRole::kFilterConstruction); + const auto filter_construction_charged = + options_overrides_iter != + context.table_options.cache_usage_options.options_overrides.end() + ? options_overrides_iter->second.charged + : context.table_options.cache_usage_options.options.charged; + + std::shared_ptr<CacheReservationManager> cache_res_mgr; + if (context.table_options.block_cache && + filter_construction_charged == + CacheEntryRoleOptions::Decision::kEnabled) { + cache_res_mgr = std::make_shared< + CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>( + context.table_options.block_cache); + } + return new FastLocalBloomBitsBuilder( + millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, + cache_res_mgr, context.table_options.detect_filter_construct_corruption); +} + +FilterBitsBuilder* BloomLikeFilterPolicy::GetLegacyBloomBuilderWithContext( + const FilterBuildingContext& context) const { + if (whole_bits_per_key_ >= 14 && context.info_log && + !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + const char* adjective; + if (whole_bits_per_key_ >= 20) { + adjective = "Dramatic"; + } else { + adjective = "Significant"; + } + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN(context.info_log, + "Using legacy Bloom filter with high (%d) bits/key. 
" + "%s filter space and/or accuracy improvement is available " + "with format_version>=5.", + whole_bits_per_key_, adjective); + } + return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); +} + +FilterBitsBuilder* +BloomLikeFilterPolicy::GetStandard128RibbonBuilderWithContext( + const FilterBuildingContext& context) const { + // FIXME: code duplication with GetFastLocalBloomBuilderWithContext + bool offm = context.table_options.optimize_filters_for_memory; + const auto options_overrides_iter = + context.table_options.cache_usage_options.options_overrides.find( + CacheEntryRole::kFilterConstruction); + const auto filter_construction_charged = + options_overrides_iter != + context.table_options.cache_usage_options.options_overrides.end() + ? options_overrides_iter->second.charged + : context.table_options.cache_usage_options.options.charged; + + std::shared_ptr<CacheReservationManager> cache_res_mgr; + if (context.table_options.block_cache && + filter_construction_charged == + CacheEntryRoleOptions::Decision::kEnabled) { + cache_res_mgr = std::make_shared< + CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>( + context.table_options.block_cache); + } + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr, + context.table_options.detect_filter_construct_corruption, + context.info_log); +} + +std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix() const { + std::string rv = ":" + std::to_string(millibits_per_key_ / 1000); + int frac = millibits_per_key_ % 1000; + if (frac > 0) { + rv.push_back('.'); + rv.push_back(static_cast<char>('0' + (frac / 100))); + frac %= 100; + if (frac > 0) { + rv.push_back(static_cast<char>('0' + (frac / 10))); + frac %= 10; + if (frac > 0) { + rv.push_back(static_cast<char>('0' + frac)); + } + } + } + return rv; +} + +FilterBitsBuilder* BuiltinFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// For testing only, but always constructable with internal names +namespace test { + +const char* LegacyBloomFilterPolicy::kClassName() { + return "rocksdb.internal.LegacyBloomFilter"; +} + +FilterBitsBuilder* LegacyBloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetLegacyBloomBuilderWithContext(context); +} + +const char* FastLocalBloomFilterPolicy::kClassName() { + return "rocksdb.internal.FastLocalBloomFilter"; +} + +FilterBitsBuilder* FastLocalBloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetFastLocalBloomBuilderWithContext(context); +} + +const char* Standard128RibbonFilterPolicy::kClassName() { + return "rocksdb.internal.Standard128RibbonFilter"; +} + +FilterBitsBuilder* Standard128RibbonFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + return GetStandard128RibbonBuilderWithContext(context); +} + +} // namespace test + +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBuiltinFilterBitsReader( + const Slice& contents) { + uint32_t len_with_meta = 
static_cast<uint32_t>(contents.size()); + if (len_with_meta <= kMetadataLen) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast<int8_t>(contents.data()[len_with_meta - kMetadataLen]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); + } + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - kMetadataLen; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = ConstexprFloorLog2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(contents); +} + +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( + const Slice& contents) { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast<uint8_t>(contents.data()[len + 1]); + uint32_t num_blocks = static_cast<uint8_t>(contents.data()[len + 2]); + num_blocks |= static_cast<uint8_t>(contents.data()[len + 3]) << 8; + num_blocks |= static_cast<uint8_t>(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. 
num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. + // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + +// For newer Bloom filter implementations +BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( + const Slice& contents) { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... | + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool /*use_block_based_builder*/) { + // NOTE: use_block_based_builder now ignored so block-based filter is no + // longer accessible in public API. 
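+  // Illustrative consequence: the following two calls now configure the same
+  // full-filter policy; the boolean remains only for signature compatibility.
+  //   NewBloomFilterPolicy(10.0, /*use_block_based_builder=*/true);
+  //   NewBloomFilterPolicy(10.0, /*use_block_based_builder=*/false);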
+ return new BloomFilterPolicy(bits_per_key); +} + +RibbonFilterPolicy::RibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) + : BloomLikeFilterPolicy(bloom_equivalent_bits_per_key), + bloom_before_level_(bloom_before_level) {} + +FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + if (GetMillibitsPerKey() == 0) { + // "No filter" special case + return nullptr; + } + // Treat unknown same as bottommost + int levelish = INT_MAX; + + switch (context.compaction_style) { + case kCompactionStyleLevel: + case kCompactionStyleUniversal: { + if (context.reason == TableFileCreationReason::kFlush) { + // Treat flush as level -1 + assert(context.level_at_creation == 0); + levelish = -1; + } else if (context.level_at_creation == -1) { + // Unknown level + assert(levelish == INT_MAX); + } else { + levelish = context.level_at_creation; + } + break; + } + case kCompactionStyleFIFO: + case kCompactionStyleNone: + // Treat as bottommost + assert(levelish == INT_MAX); + break; + } + if (levelish < bloom_before_level_) { + return GetFastLocalBloomBuilderWithContext(context); + } else { + return GetStandard128RibbonBuilderWithContext(context); + } +} + +const char* RibbonFilterPolicy::kClassName() { return "ribbonfilter"; } +const char* RibbonFilterPolicy::kNickName() { return "rocksdb.RibbonFilter"; } + +std::string RibbonFilterPolicy::GetId() const { + return BloomLikeFilterPolicy::GetId() + ":" + + std::to_string(bloom_before_level_); +} + +const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) { + return new RibbonFilterPolicy(bloom_equivalent_bits_per_key, + bloom_before_level); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() {} + +std::shared_ptr<const FilterPolicy> BloomLikeFilterPolicy::Create( + const std::string& name, double bits_per_key) { + if (name == test::LegacyBloomFilterPolicy::kClassName()) { + return std::make_shared<test::LegacyBloomFilterPolicy>(bits_per_key); + } else if (name == test::FastLocalBloomFilterPolicy::kClassName()) { + return std::make_shared<test::FastLocalBloomFilterPolicy>(bits_per_key); + } else if (name == test::Standard128RibbonFilterPolicy::kClassName()) { + return std::make_shared<test::Standard128RibbonFilterPolicy>(bits_per_key); + } else if (name == BloomFilterPolicy::kClassName()) { + // For testing + return std::make_shared<BloomFilterPolicy>(bits_per_key); + } else if (name == RibbonFilterPolicy::kClassName()) { + // For testing + return std::make_shared<RibbonFilterPolicy>(bits_per_key, + /*bloom_before_level*/ 0); + } else { + return nullptr; + } +} + +#ifndef ROCKSDB_LITE +namespace { +static ObjectLibrary::PatternEntry FilterPatternEntryWithBits( + const char* name) { + return ObjectLibrary::PatternEntry(name, false).AddNumber(":", false); +} + +template <typename T> +T* NewBuiltinFilterPolicyWithBits(const std::string& uri) { + const std::vector<std::string> vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + return new T(bits_per_key); +} +static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<const FilterPolicy>( + ReadOnlyBuiltinFilterPolicy::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset(new 
ReadOnlyBuiltinFilterPolicy()); + return guard->get(); + }); + + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset(NewBuiltinFilterPolicyWithBits<BloomFilterPolicy>(uri)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()) + .AddSuffix(":false"), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset(NewBuiltinFilterPolicyWithBits<BloomFilterPolicy>(uri)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(BloomFilterPolicy::kClassName()) + .AnotherName(BloomFilterPolicy::kNickName()) + .AddSuffix(":true"), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + const std::vector<std::string> vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + // NOTE: This case previously configured the deprecated block-based + // filter, but old ways of configuring that now map to full filter. We + // defer to the corresponding API to ensure consistency in case that + // change is reverted. + guard->reset(NewBloomFilterPolicy(bits_per_key, true)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName()) + .AnotherName(RibbonFilterPolicy::kNickName()), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + const std::vector<std::string> vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + guard->reset(NewRibbonFilterPolicy(bits_per_key)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName()) + .AnotherName(RibbonFilterPolicy::kNickName()) + .AddNumber(":", true), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + const std::vector<std::string> vals = StringSplit(uri, ':'); + double bits_per_key = ParseDouble(vals[1]); + int bloom_before_level = ParseInt(vals[2]); + guard->reset(NewRibbonFilterPolicy(bits_per_key, bloom_before_level)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits(test::LegacyBloomFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits<test::LegacyBloomFilterPolicy>(uri)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits( + test::FastLocalBloomFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits<test::FastLocalBloomFilterPolicy>( + uri)); + return guard->get(); + }); + library.AddFactory<const FilterPolicy>( + FilterPatternEntryWithBits( + test::Standard128RibbonFilterPolicy::kClassName()), + [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard, + std::string* /* errmsg */) { + guard->reset( + NewBuiltinFilterPolicyWithBits<test::Standard128RibbonFilterPolicy>( + uri)); + return guard->get(); + }); + size_t 
num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +} // namespace +#endif // ROCKSDB_LITE + +Status FilterPolicy::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr<const FilterPolicy>* policy) { + if (value == kNullptrString || value.empty()) { + policy->reset(); + return Status::OK(); + } else if (value == ReadOnlyBuiltinFilterPolicy::kClassName()) { + *policy = std::make_shared<ReadOnlyBuiltinFilterPolicy>(); + return Status::OK(); + } + + std::string id; + std::unordered_map<std::string, std::string> opt_map; + Status status = + Customizable::GetOptionsMap(options, policy->get(), value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + std::call_once(loaded, [&]() { + RegisterBuiltinFilterPolicies(*(ObjectLibrary::Default().get()), ""); + }); + status = options.registry->NewSharedObject(id, policy); +#else + status = + Status::NotSupported("Cannot load filter policy in LITE mode ", value); +#endif // ROCKSDB_LITE + } + if (options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject( + options, const_cast<FilterPolicy*>(policy->get()), opt_map); + } + return status; +} + +const std::vector<std::string>& BloomLikeFilterPolicy::GetAllFixedImpls() { + STATIC_AVOID_DESTRUCTION(std::vector<std::string>, impls){ + // Match filter_bench -impl=x ordering + test::LegacyBloomFilterPolicy::kClassName(), + test::FastLocalBloomFilterPolicy::kClassName(), + test::Standard128RibbonFilterPolicy::kClassName(), + }; + return impls; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy_internal.h b/src/rocksdb/table/block_based/filter_policy_internal.h new file mode 100644 index 000000000..9bc3a2482 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy_internal.h @@ -0,0 +1,340 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. 
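+  // For example (illustrative): calling AddKey("k") twice in a row would
+  // typically store only one hash, because the second key hashes the same
+  // as the most recently added one.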
+ virtual void AddKey(const Slice& key) = 0; + + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. `return 0;` may be used to conspicuously indicate "unknown". + virtual size_t EstimateEntriesAdded() = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0; + + // Similar to Finish(std::unique_ptr<const char[]>* buf), except that + // for a non-null status pointer argument, it will point to + // Status::Corruption() when there is any corruption during filter + // construction or Status::OK() otherwise. + // + // WARNING: do not use a filter resulted from a corrupted construction + // TODO: refactor this to have a better signature, consolidate + virtual Slice Finish(std::unique_ptr<const char[]>* buf, + Status* /* status */) { + return Finish(buf); + } + + // Verify the filter returned from calling FilterBitsBuilder::Finish. + // The function returns Status::Corruption() if there is any corruption in the + // constructed filter or Status::OK() otherwise. + // + // Implementations should normally consult + // FilterBuildingContext::table_options.detect_filter_construct_corruption + // to determine whether to perform verification or to skip by returning + // Status::OK(). The decision is left to the FilterBitsBuilder so that + // verification prerequisites before PostVerify can be skipped when not + // configured. + // + // RocksDB internal will always call MaybePostVerify() on the filter after + // it is returned from calling FilterBitsBuilder::Finish + // except for FilterBitsBuilder::Finish resulting a corruption + // status, which indicates the filter is already in a corrupted state and + // there is no need to post-verify + virtual Status MaybePostVerify(const Slice& /* filter_content */) { + return Status::OK(); + } + + // Approximate the number of keys that can be added and generate a filter + // <= the specified number of bytes. Callers (including RocksDB) should + // only use this result for optimizing performance and not as a guarantee. + virtual size_t ApproximateNumEntries(size_t bytes) = 0; +}; + +// A class that checks if a key can be in filter +// It should be initialized by Slice generated by BitsBuilder +class FilterBitsReader { + public: + virtual ~FilterBitsReader() {} + + // Check if the entry match the bits in filter + virtual bool MayMatch(const Slice& entry) = 0; + + // Check if an array of entries match the bits in filter + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) { + for (int i = 0; i < num_keys; ++i) { + may_match[i] = MayMatch(*keys[i]); + } + } +}; + +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { + public: + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. 
+ virtual size_t CalculateSpace(size_t num_entries) = 0; + + // Returns an estimate of the FP rate of the returned filter if + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; +}; + +// Base class for RocksDB built-in filter reader with +// extra useful functionalities for inernal. +class BuiltinFilterBitsReader : public FilterBitsReader { + public: + // Check if the hash of the entry match the bits in filter + virtual bool HashMayMatch(const uint64_t /* h */) { return true; } +}; + +// Base class for RocksDB built-in filter policies. This provides the +// ability to read all kinds of built-in filters (so that old filters can +// be used even when you change between built-in policies). +class BuiltinFilterPolicy : public FilterPolicy { + public: // overrides + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + static const char* kClassName(); + bool IsInstanceOf(const std::string& id) const override; + // All variants of BuiltinFilterPolicy can read each others filters. + const char* CompatibilityName() const override; + static const char* kCompatibilityName(); + + public: // new + // An internal function for the implementation of + // BuiltinFilterBitsReader::GetFilterBitsReader without requiring an instance + // or working around potential virtual overrides. + static BuiltinFilterBitsReader* GetBuiltinFilterBitsReader( + const Slice& contents); + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + private: + // For Bloom filter implementation(s) + static BuiltinFilterBitsReader* GetBloomBitsReader(const Slice& contents); + + // For Ribbon filter implementation(s) + static BuiltinFilterBitsReader* GetRibbonBitsReader(const Slice& contents); +}; + +// A "read only" filter policy used for backward compatibility with old +// OPTIONS files, which did not specifying a Bloom configuration, just +// "rocksdb.BuiltinBloomFilter". Although this can read existing filters, +// this policy does not build new filters, so new SST files generated +// under the policy will get no filters (like nullptr FilterPolicy). +// This class is considered internal API and subject to change. +class ReadOnlyBuiltinFilterPolicy : public BuiltinFilterPolicy { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName(); + + // Does not write filters. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override { + return nullptr; + } +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters including +// Ribbon filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy and NewRibbonFilterPolicy. 
+class BloomLikeFilterPolicy : public BuiltinFilterPolicy { + public: + explicit BloomLikeFilterPolicy(double bits_per_key); + + ~BloomLikeFilterPolicy() override; + static const char* kClassName(); + bool IsInstanceOf(const std::string& id) const override; + + std::string GetId() const override; + + // Essentially for testing only: configured millibits/key + int GetMillibitsPerKey() const { return millibits_per_key_; } + // Essentially for testing only: legacy whole bits/key + int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + + // All the different underlying implementations that a BloomLikeFilterPolicy + // might use, as a configuration string name for a testing mode for + // "always use this implementation." Only appropriate for unit tests. + static const std::vector<std::string>& GetAllFixedImpls(); + + // Convenience function for creating by name for fixed impls + static std::shared_ptr<const FilterPolicy> Create(const std::string& name, + double bits_per_key); + + protected: + // Some implementations used by aggregating policies + FilterBitsBuilder* GetLegacyBloomBuilderWithContext( + const FilterBuildingContext& context) const; + FilterBitsBuilder* GetFastLocalBloomBuilderWithContext( + const FilterBuildingContext& context) const; + FilterBitsBuilder* GetStandard128RibbonBuilderWithContext( + const FilterBuildingContext& context) const; + + std::string GetBitsPerKeySuffix() const; + + private: + // Bits per key settings are for configuring Bloom filters. + + // Newer filters support fractional bits per key. For predictable behavior + // of 0.001-precision values across floating point implementations, we + // round to thousandths of a bit (on average) per key. + int millibits_per_key_; + + // Older filters round to whole number bits per key. (There *should* be no + // compatibility issue with fractional bits per key, but preserving old + // behavior with format_version < 5 just in case.) + int whole_bits_per_key_; + + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + + // Whether relevant warnings have been logged already. (Remember so we + // only report once per BloomFilterPolicy instance, to keep the noise down.) + mutable std::atomic<bool> warned_; + + // State for implementing optimize_filters_for_memory. Essentially, this + // tracks a surplus or deficit in total FP rate of filters generated by + // builders under this policy vs. what would have been generated without + // optimize_filters_for_memory. + // + // To avoid floating point weirdness, the actual value is + // Sum over all generated filters f: + // (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32 + mutable std::atomic<int64_t> aggregate_rounding_balance_; +}; + +// For NewBloomFilterPolicy +// +// This is a user-facing policy that automatically choose between +// LegacyBloom and FastLocalBloom based on context at build time, +// including compatibility with format_version. +class BloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit BloomFilterPolicy(double bits_per_key); + + // To use this function, call BuiltinFilterPolicy::GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. 
+ FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } + static const char* kNickName(); + const char* NickName() const override { return kNickName(); } + std::string GetId() const override; +}; + +// For NewRibbonFilterPolicy +// +// This is a user-facing policy that chooses between Standard128Ribbon +// and FastLocalBloom based on context at build time (LSM level and other +// factors in extreme cases). +class RibbonFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit RibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level); + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + int GetBloomBeforeLevel() const { return bloom_before_level_; } + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } + static const char* kNickName(); + const char* NickName() const override { return kNickName(); } + std::string GetId() const override; + + private: + const int bloom_before_level_; +}; + +// For testing only, but always constructable with internal names +namespace test { + +class LegacyBloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit LegacyBloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +class FastLocalBloomFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit FastLocalBloomFilterPolicy(double bits_per_key) + : BloomLikeFilterPolicy(bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +class Standard128RibbonFilterPolicy : public BloomLikeFilterPolicy { + public: + explicit Standard128RibbonFilterPolicy(double bloom_equiv_bits_per_key) + : BloomLikeFilterPolicy(bloom_equiv_bits_per_key) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + static const char* kClassName(); + const char* Name() const override { return kClassName(); } +}; + +} // namespace test + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/flush_block_policy.cc b/src/rocksdb/table/block_based/flush_block_policy.cc new file mode 100644 index 000000000..9bb1f334b --- /dev/null +++ b/src/rocksdb/table/block_based/flush_block_policy.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/flush_block_policy.h" + +#include <cassert> +#include <mutex> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/customizable_util.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +// Flush block by size +class FlushBlockBySizePolicy : public FlushBlockPolicy { + public: + // @params block_size: Approximate size of user data packed per + // block. + // @params block_size_deviation: This is used to close a block before it + // reaches the configured + FlushBlockBySizePolicy(const uint64_t block_size, + const uint64_t block_size_deviation, const bool align, + const BlockBuilder& data_block_builder) + : block_size_(block_size), + block_size_deviation_limit_( + ((block_size * (100 - block_size_deviation)) + 99) / 100), + align_(align), + data_block_builder_(data_block_builder) {} + + bool Update(const Slice& key, const Slice& value) override { + // it makes no sense to flush when the data block is empty + if (data_block_builder_.empty()) { + return false; + } + + auto curr_size = data_block_builder_.CurrentSizeEstimate(); + + // Do flush if one of the below two conditions is true: + // 1) if the current estimated size already exceeds the block size, + // 2) block_size_deviation is set and the estimated size after appending + // the kv will exceed the block size and the current size is under the + // the deviation. + return curr_size >= block_size_ || BlockAlmostFull(key, value); + } + + private: + bool BlockAlmostFull(const Slice& key, const Slice& value) const { + if (block_size_deviation_limit_ == 0) { + return false; + } + + const auto curr_size = data_block_builder_.CurrentSizeEstimate(); + auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + if (align_) { + estimated_size_after += BlockBasedTable::kBlockTrailerSize; + return estimated_size_after > block_size_; + } + + return estimated_size_after > block_size_ && + curr_size > block_size_deviation_limit_; + } + + const uint64_t block_size_; + const uint64_t block_size_deviation_limit_; + const bool align_; + const BlockBuilder& data_block_builder_; +}; + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const { + return new FlushBlockBySizePolicy( + table_options.block_size, table_options.block_size_deviation, + table_options.block_align, data_block_builder); +} + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder) { + return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); +} + +#ifndef ROCKSDB_LITE +static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory<FlushBlockPolicyFactory>( + FlushBlockBySizePolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr<FlushBlockPolicyFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockBySizePolicyFactory()); + return guard->get(); + }); + library.AddFactory<FlushBlockPolicyFactory>( + FlushBlockEveryKeyPolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr<FlushBlockPolicyFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockEveryKeyPolicyFactory()); + 
return guard->get(); + }); + return 2; +} +#endif // ROCKSDB_LITE + +static bool LoadFlushPolicyFactory( + const std::string& id, std::shared_ptr<FlushBlockPolicyFactory>* result) { + if (id.empty()) { + result->reset(new FlushBlockBySizePolicyFactory()); +#ifdef ROCKSDB_LITE + } else if (id == FlushBlockBySizePolicyFactory::kClassName()) { + result->reset(new FlushBlockBySizePolicyFactory()); + } else if (id == FlushBlockEveryKeyPolicyFactory::kClassName()) { + result->reset(new FlushBlockEveryKeyPolicyFactory()); +#endif // ROCKSDB_LITE + } else { + return false; + } + return true; +} + +FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory() + : FlushBlockPolicyFactory() {} + +Status FlushBlockPolicyFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr<FlushBlockPolicyFactory>* factory) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject<FlushBlockPolicyFactory>( + config_options, value, LoadFlushPolicyFactory, factory); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/flush_block_policy.h b/src/rocksdb/table/block_based/flush_block_policy.h new file mode 100644 index 000000000..4f79682bc --- /dev/null +++ b/src/rocksdb/table/block_based/flush_block_policy.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/flush_block_policy.h" + +namespace ROCKSDB_NAMESPACE { + +// FlushBlockEveryKeyPolicy currently used only in tests. + +class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { + public: + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (!start_) { + start_ = true; + return false; + } + return true; + } + + private: + bool start_ = false; +}; + +class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { + public: + explicit FlushBlockEveryKeyPolicyFactory() {} + + static const char* kClassName() { return "FlushBlockEveryKeyPolicyFactory"; } + const char* Name() const override { return kClassName(); } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return new FlushBlockEveryKeyPolicy; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/full_filter_block.cc b/src/rocksdb/table/block_based/full_filter_block.cc new file mode 100644 index 000000000..62b7a9eca --- /dev/null +++ b/src/rocksdb/table/block_based/full_filter_block.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_based/full_filter_block.h" + +#include <array> + +#include "block_type.h" +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(_prefix_extractor), + whole_key_filtering_(whole_key_filtering), + last_whole_key_recorded_(false), + last_prefix_recorded_(false), + last_key_in_domain_(false), + any_added_(false) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +size_t FullFilterBlockBuilder::EstimateEntriesAdded() { + return filter_bits_builder_->EstimateEntriesAdded(); +} + +void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { + const bool add_prefix = + prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + + if (whole_key_filtering_) { + if (!add_prefix) { + AddKey(key_without_ts); + } else { + // if both whole_key and prefix are added to bloom then we will have whole + // key_without_ts and prefix addition being interleaved and thus cannot + // rely on the bits builder to properly detect the duplicates by comparing + // with the last item. + Slice last_whole_key = Slice(last_whole_key_str_); + if (!last_whole_key_recorded_ || + last_whole_key.compare(key_without_ts) != 0) { + AddKey(key_without_ts); + last_whole_key_recorded_ = true; + last_whole_key_str_.assign(key_without_ts.data(), + key_without_ts.size()); + } + } + } + if (add_prefix) { + last_key_in_domain_ = true; + AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + any_added_ = true; +} + +// Add prefix to filter if needed +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); + Slice prefix = prefix_extractor_->Transform(key); + if (whole_key_filtering_) { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the last + // item. + Slice last_prefix = Slice(last_prefix_str_); + if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { + AddKey(prefix); + last_prefix_recorded_ = true; + last_prefix_str_.assign(prefix.data(), prefix.size()); + } + } else { + AddKey(prefix); + } +} + +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + +Slice FullFilterBlockBuilder::Finish( + const BlockHandle& /*tmp*/, Status* status, + std::unique_ptr<const char[]>* filter_data) { + Reset(); + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (any_added_) { + any_added_ = false; + Slice filter_content = filter_bits_builder_->Finish( + filter_data ? 
filter_data : &filter_data_, status); + return filter_content; + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +bool FullFilterBlockReader::KeyMayMatch(const Slice& key, const bool no_io, + const Slice* const /*const_ikey_ptr*/, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + if (!whole_key_filtering()) { + return true; + } + return MayMatch(key, no_io, get_context, lookup_context, + rate_limiter_priority); +} + +std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<ParsedFullFilterBlock> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block, BlockType::kFilter); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new FullFilterBlockReader(table, std::move(filter_block))); +} + +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + return MayMatch(prefix, no_io, get_context, lookup_context, + rate_limiter_priority); +} + +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block, + BlockType::kFilter, rate_limiter_priority); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatch(entry)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } + return true; +} + +void FullFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + if (!whole_key_filtering()) { + // Simply return. 
Don't skip any key - consider all keys as likely to be + // present + return; + } + MayMatch(range, no_io, nullptr, lookup_context, rate_limiter_priority); +} + +void FullFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + const bool no_io, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + MayMatch(range, no_io, prefix_extractor, lookup_context, + rate_limiter_priority); +} + +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = GetOrReadFilterBlock( + no_io, range->begin()->get_context, lookup_context, &filter_block, + BlockType::kFilter, rate_limiter_priority); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (!filter_bits_reader) { + return; + } + + // We need to use an array instead of autovector for may_match since + // &may_match[0] doesn't work for autovector<bool> (compiler error). So + // declare both keys and may_match as arrays, which is also slightly less + // expensive compared to autovector + std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys; + std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}}; + autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes; + int num_keys = 0; + MultiGetRange filter_range(*range, range->begin(), range->end()); + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!prefix_extractor) { + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts)); + keys[num_keys++] = &prefixes.back(); + } else { + filter_range.SkipKey(iter); + } + } + + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); + + int i = 0; + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!may_match[i]) { + // Update original MultiGet range to skip this key. The filter_range + // was temporarily used just to skip keys not in prefix_extractor domain + range->SkipKey(iter); + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + } else { + // PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->bloom_sst_hit_count++; + } + ++i; + } +} + +size_t FullFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/full_filter_block.h b/src/rocksdb/table/block_based/full_filter_block.h new file mode 100644 index 000000000..cd1771a38 --- /dev/null +++ b/src/rocksdb/table/block_based/full_filter_block.h @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <stddef.h> +#include <stdint.h> + +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder); + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete; + void operator=(const FullFilterBlockBuilder&) = delete; + + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { return !any_added_; } + virtual size_t EstimateEntriesAdded() override; + virtual Slice Finish( + const BlockHandle& tmp, Status* status, + std::unique_ptr<const char[]>* filter_data = nullptr) override; + using FilterBlockBuilder::Finish; + + virtual void ResetFilterBitsBuilder() override { + filter_bits_builder_.reset(); + } + + virtual Status MaybePostVerifyFilter(const Slice& filter_content) override { + return filter_bits_builder_->MaybePostVerify(filter_content); + } + + protected: + virtual void AddKey(const Slice& key); + std::unique_ptr<FilterBitsBuilder> filter_bits_builder_; + virtual void Reset(); + void AddPrefix(const Slice& key); + const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + bool last_whole_key_recorded_; + std::string last_whole_key_str_; + bool last_prefix_recorded_; + std::string last_prefix_str_; + // Whether prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; + bool any_added_; + std::unique_ptr<const char[]> filter_data_; +}; + +// A FilterBlockReader is used to parse filter from SST table. 
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader + : public FilterBlockReaderCommon<ParsedFullFilterBlock> { + public: + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); + + bool KeyMayMatch(const Slice& key, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + + bool PrefixMayMatch(const Slice& prefix, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + + void KeysMayMatch(MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + // Used in partitioned filter code + void KeysMayMatch2(MultiGetRange* range, + const SliceTransform* /*prefix_extractor*/, + const bool no_io, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority); + } + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + size_t ApproximateMemoryUsage() const override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const; + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/full_filter_block_test.cc b/src/rocksdb/table/block_based/full_filter_block_test.cc new file mode 100644 index 000000000..bd98638e5 --- /dev/null +++ b/src/rocksdb/table/block_based/full_filter_block_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_based/full_filter_block.h" + +#include <set> + +#include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + using FilterBitsBuilder::Finish; + + // Generate the filter using the keys that are added + Slice Finish(std::unique_ptr<const char[]>* buf) override { + uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + size_t EstimateEntriesAdded() override { return hash_entries_.size(); } + + size_t ApproximateNumEntries(size_t bytes) override { return bytes / 4; } + + private: + std::vector<uint32_t> hash_entries_; +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {} + + // Silence compiler warning about overloaded virtual + using FilterBitsReader::MayMatch; + bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + const char* CompatibilityName() const override { return Name(); } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override { + return new TestFilterBitsBuilder(); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + PluginFullFilterBlockTest() + : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, Env::IO_TOTAL)); +} + +TEST_F(PluginFullFilterBlockTest, 
PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice slice = builder.Finish(); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("bar", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("box", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("hello", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(!reader.KeyMayMatch("other", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); +} + +class FullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FullFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {} +}; + +TEST_F(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, Env::IO_TOTAL)); +} + +class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { + std::unique_ptr<FilterBitsBuilder> b_; + std::set<std::string> uniq_; + + public: + explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {} + + ~CountUniqueFilterBitsBuilderWrapper() override {} + + void AddKey(const Slice& key) override { + b_->AddKey(key); + uniq_.insert(key.ToString()); + } + + using FilterBitsBuilder::Finish; + + Slice Finish(std::unique_ptr<const char[]>* buf) override { + Slice rv = b_->Finish(buf); + Status s_dont_care = b_->MaybePostVerify(rv); + s_dont_care.PermitUncheckedError(); + uniq_.clear(); + return rv; + } + + size_t EstimateEntriesAdded() override { return b_->EstimateEntriesAdded(); } + + size_t ApproximateNumEntries(size_t bytes) override { + return b_->ApproximateNumEntries(bytes); + } + + size_t CountUnique() { return uniq_.size(); } +}; + 
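A standalone sketch of the unique-entry arithmetic that the DuplicateEntries test below checks through CountUniqueFilterBitsBuilderWrapper. The assumption here (illustrative, not the real FullFilterBlockBuilder logic) is that with whole-key filtering on and a fixed-length prefix extractor, each Add(key) contributes the key itself plus its prefix whenever the key is at least prefix-length bytes long; the wrapper then counts distinct strings.

#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <vector>

size_t CountUniqueEntries(const std::vector<std::string>& keys,
                          size_t prefix_len) {
  std::set<std::string> uniq;
  for (const auto& key : keys) {
    uniq.insert(key);                          // whole key
    if (key.size() >= prefix_len) {            // key is "in domain"
      uniq.insert(key.substr(0, prefix_len));  // fixed-length prefix
    }
  }
  return uniq.size();
}

int main() {
  // Mirrors the second half of DuplicateEntries (prefix length 7):
  // 1 empty key, 2 distinct prefixes, 4 distinct non-empty keys -> 7.
  std::vector<std::string> keys = {"",            "prefix1key1", "prefix1key1",
                                   "prefix1key2", "prefix1key3", "prefix2key4"};
  std::cout << CountUniqueEntries(keys, 7) << "\n";  // prints 7

  // Mirrors the first half (prefix length 0): "key1", "key2" and the shared
  // empty prefix -> 3 unique entries; a later Add("") adds nothing new.
  std::cout << CountUniqueEntries({"key1", "key2", ""}, 0) << "\n";  // prints 3
  return 0;
}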
+TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, bits_builder->CountUnique()); + // adds key and empty prefix; both abstractions count them + builder.Add("key1"); + ASSERT_EQ(2, bits_builder->CountUnique()); + // Add different key (unique) and also empty prefix (not unique). + // From here in this test, it's immaterial whether the block builder + // can count unique keys. + builder.Add("key2"); + ASSERT_EQ(3, bits_builder->CountUnique()); + // Empty key -> nothing unique + builder.Add(""); + ASSERT_EQ(3, bits_builder->CountUnique()); + } + + // mix of empty and non-empty + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // 1 empty, 2 non-empty prefixes, and 4 non-empty keys + ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + ASSERT_TRUE(builder.IsEmpty()); + builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + // "box" only counts once + ASSERT_EQ(4, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("bar", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("box", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("hello", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(reader.KeyMayMatch("foo", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + ASSERT_TRUE(!reader.KeyMayMatch("other", + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); +} + +} // namespace ROCKSDB_NAMESPACE + +int 
main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/hash_index_reader.cc b/src/rocksdb/table/block_based/hash_index_reader.cc new file mode 100644 index 000000000..bcaba17a2 --- /dev/null +++ b/src/rocksdb/table/block_based/hash_index_reader.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/hash_index_reader.h" + +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" + +namespace ROCKSDB_NAMESPACE { +Status HashIndexReader::Create(const BlockBasedTable* table, + const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = + FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->table_prefix_extractor); + s = BlockPrefixIndex::Create(rep->table_prefix_extractor.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast<HashIndexReader*>(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); +} + +InternalIteratorBase<IndexValue>* HashIndexReader::NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, + get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, + total_order_seek, index_has_first_key(), index_key_includes_seq(), + index_value_is_full(), false /* block_contents_pinned */, + prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/hash_index_reader.h b/src/rocksdb/table/block_based/hash_index_reader.h new file mode 100644 index 000000000..9037efc87 --- /dev/null +++ b/src/rocksdb/table/block_based/hash_index_reader.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader); + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<HashIndexReader*>(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr<BlockPrefixIndex> prefix_index_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_builder.cc b/src/rocksdb/table/block_based/index_builder.cc new file mode 100644 index 000000000..024730178 --- /dev/null +++ b/src/rocksdb/table/block_based/index_builder.cc @@ -0,0 +1,282 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
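A standalone sketch of the fallback behavior HashIndexReader::Create relies on above: building the prefix hash index is best-effort, and when the prefix metablocks are missing or unreadable the reader silently degrades to ordinary binary search over the index entries. The names below (ToyIndex, TryBuildPrefixMap) are illustrative only and do not reflect how BlockPrefixIndex is actually laid out.

#include <algorithm>
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

class ToyIndex {
 public:
  explicit ToyIndex(std::vector<std::string> sorted_keys)
      : keys_(std::move(sorted_keys)) {}

  // Best-effort accelerator: may be skipped entirely (e.g. meta blocks
  // absent) without turning table open into a hard error.
  void TryBuildPrefixMap(size_t prefix_len) {
    std::map<std::string, size_t> m;
    for (size_t i = 0; i < keys_.size(); ++i) {
      m.emplace(keys_[i].substr(0, prefix_len), i);  // first position per prefix
    }
    prefix_len_ = prefix_len;
    prefix_map_ = std::move(m);
  }

  // Seek to the first key >= target, narrowing the search with the prefix
  // map when it exists, plain binary search otherwise.
  size_t Seek(const std::string& target) const {
    size_t lo = 0;
    if (prefix_map_) {
      auto it = prefix_map_->find(target.substr(0, prefix_len_));
      if (it != prefix_map_->end()) lo = it->second;  // narrowed start
    }
    return std::lower_bound(keys_.begin() + lo, keys_.end(), target) -
           keys_.begin();
  }

 private:
  std::vector<std::string> keys_;
  size_t prefix_len_ = 0;
  std::optional<std::map<std::string, size_t>> prefix_map_;  // may stay empty
};

int main() {
  ToyIndex index({"aaa1", "aaa2", "bbb1", "bbb2", "ccc1"});
  std::cout << index.Seek("bbb0") << "\n";  // 2, via plain binary search
  index.TryBuildPrefixMap(3);
  std::cout << index.Seek("bbb0") << "\n";  // still 2, search start narrowed
  return 0;
}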
+ +#include "table/block_based/index_builder.h" + +#include <assert.h> + +#include <cinttypes> +#include <list> +#include <string> + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/flush_block_policy.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +// Create a index builder based on its type. +IndexBuilder* IndexBuilder::CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt) { + IndexBuilder* result = nullptr; + switch (index_type) { + case BlockBasedTableOptions::kBinarySearch: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening, /* include_first_key */ false); + break; + } + case BlockBasedTableOptions::kHashSearch: { + // Currently kHashSearch is incompatible with index_block_restart_interval + // > 1 + assert(table_opt.index_block_restart_interval == 1); + result = new HashIndexBuilder( + comparator, int_key_slice_transform, + table_opt.index_block_restart_interval, table_opt.format_version, + use_value_delta_encoding, table_opt.index_shortening); + break; + } + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); + break; + } + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening, /* include_first_key */ true); + break; + } + default: { + assert(!"Do not recognize the index type "); + break; + } + } + return result; +} + +void ShortenedIndexBuilder::FindShortestInternalKeySeparator( + const Comparator& comparator, std::string* start, const Slice& limit) { + // Attempt to shorten the user portion of the key + Slice user_start = ExtractUserKey(*start); + Slice user_limit = ExtractUserKey(limit); + std::string tmp(user_start.data(), user_start.size()); + comparator.FindShortestSeparator(&tmp, user_limit); + if (tmp.size() <= user_start.size() && + comparator.Compare(user_start, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. + PutFixed64(&tmp, + PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); + assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0); + assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0); + start->swap(tmp); + } +} + +void ShortenedIndexBuilder::FindShortInternalKeySuccessor( + const Comparator& comparator, std::string* key) { + Slice user_key = ExtractUserKey(*key); + std::string tmp(user_key.data(), user_key.size()); + comparator.FindShortSuccessor(&tmp); + if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) { + // User key has become shorter physically, but larger logically. + // Tack on the earliest possible number to the shortened user key. 
+ PutFixed64(&tmp, + PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); + assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0); + key->swap(tmp); + } +} + +PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( + const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt) { + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); +} + +PartitionedIndexBuilder::PartitionedIndexBuilder( + const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) + : IndexBuilder(comparator), + index_block_builder_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + sub_index_builder_(nullptr), + table_opt_(table_opt), + // We start by false. After each partition we revise the value based on + // what the sub_index_builder has decided. If the feature is disabled + // entirely, this will be set to true after switching the first + // sub_index_builder. Otherwise, it could be set to true even one of the + // sub_index_builders could not safely exclude seq from the keys, then it + // wil be enforced on all sub_index_builders on ::Finish. + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} + +PartitionedIndexBuilder::~PartitionedIndexBuilder() { + delete sub_index_builder_; +} + +void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { + assert(sub_index_builder_ == nullptr); + sub_index_builder_ = new ShortenedIndexBuilder( + comparator_, table_opt_.index_block_restart_interval, + table_opt_.format_version, use_value_delta_encoding_, + table_opt_.index_shortening, /* include_first_key */ false); + + // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if + // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by + // default on Creation) so that flush policy can point to + // sub_index_builder_->index_block_builder_ + if (seperator_is_key_plus_seq_) { + sub_index_builder_->seperator_is_key_plus_seq_ = true; + } + + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + // Note: this is sub-optimal since sub_index_builder_ could later reset + // seperator_is_key_plus_seq_ but the probability of that is low. + sub_index_builder_->seperator_is_key_plus_seq_ + ? 
sub_index_builder_->index_block_builder_ + : sub_index_builder_->index_block_builder_without_seq_)); + partition_cut_requested_ = false; +} + +void PartitionedIndexBuilder::RequestPartitionCut() { + partition_cut_requested_ = true; +} + +void PartitionedIndexBuilder::AddIndexEntry( + std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, const BlockHandle& block_handle) { + // Note: to avoid two consecuitive flush in the same method call, we do not + // check flush policy when adding the last key + if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys + if (sub_index_builder_ == nullptr) { + MakeNewSubIndexBuilder(); + } + sub_index_builder_->AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. + seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); + } + sub_index_last_key_ = std::string(*last_key_in_current_block); + entries_.push_back( + {sub_index_last_key_, + std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)}); + sub_index_builder_ = nullptr; + cut_filter_block = true; + } else { + // apply flush policy only to non-empty sub_index_builder_ + if (sub_index_builder_ != nullptr) { + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + bool do_flush = + partition_cut_requested_ || + flush_policy_->Update(*last_key_in_current_block, handle_encoding); + if (do_flush) { + entries_.push_back( + {sub_index_last_key_, + std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)}); + cut_filter_block = true; + sub_index_builder_ = nullptr; + } + } + if (sub_index_builder_ == nullptr) { + MakeNewSubIndexBuilder(); + } + sub_index_builder_->AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + sub_index_last_key_ = std::string(*last_key_in_current_block); + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. 
+ seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); + } + } +} + +Status PartitionedIndexBuilder::Finish( + IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } + // It must be set to null after last key is added + assert(sub_index_builder_ == nullptr); + if (finishing_indexes == true) { + Entry& last_entry = entries_.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), + handle_encoding, + &handle_delta_encoding_slice); + } + entries_.pop_front(); + } + // If there is no sub_index left, then return the 2nd level index. + if (UNLIKELY(entries_.empty())) { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; + return Status::OK(); + } else { + // Finish the next partition index in line and Incomplete() to indicate we + // expect more calls to Finish + Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; + auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); + finishing_indexes = true; + return s.ok() ? Status::Incomplete() : s; + } +} + +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_builder.h b/src/rocksdb/table/block_based/index_builder.h new file mode 100644 index 000000000..dd3be0331 --- /dev/null +++ b/src/rocksdb/table/block_based/index_builder.h @@ -0,0 +1,455 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <assert.h> + +#include <cinttypes> +#include <list> +#include <string> +#include <unordered_map> + +#include "rocksdb/comparator.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { +// The interface for building index. +// Instruction for adding a new concrete IndexBuilder: +// 1. Create a subclass instantiated from IndexBuilder. +// 2. Add a new entry associated with that subclass in TableOptions::IndexType. +// 3. 
Add a create function for the new subclass in CreateIndexBuilder. +// Note: we can devise more advanced design to simplify the process for adding +// new subclass, which will, on the other hand, increase the code complexity and +// catch unwanted attention from readers. Given that we won't add/change +// indexes frequently, it makes sense to just embrace a more straightforward +// design that just works. +class IndexBuilder { + public: + static IndexBuilder* CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + // Index builder will construct a set of blocks which contain: + // 1. One primary index block. + // 2. (Optional) a set of metablocks that contains the metadata of the + // primary index. + struct IndexBlocks { + Slice index_block_contents; + std::unordered_map<std::string, Slice> meta_blocks; + }; + explicit IndexBuilder(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. + // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. + // @last_key_in_current_block: this parameter maybe overridden with the value + // "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + virtual void OnKeyAdded(const Slice& /*key*/) {} + + // Inform the index builder that all entries has been written. Block builder + // may therefore perform any operation required for block finalization. + // + // REQUIRES: Finish() has not yet been called. + inline Status Finish(IndexBlocks* index_blocks) { + // Throw away the changes to last_partition_block_handle. It has no effect + // on the first call to Finish anyway. + BlockHandle last_partition_block_handle; + return Finish(index_blocks, last_partition_block_handle); + } + + // This override of Finish can be utilized to build the 2nd level index in + // PartitionIndexBuilder. + // + // index_blocks will be filled with the resulting index data. If the return + // value is Status::InComplete() then it means that the index is partitioned + // and the callee should keep calling Finish until Status::OK() is returned. + // In that case, last_partition_block_handle is pointer to the block written + // with the result of the last call to Finish. This can be utilized to build + // the second level index pointing to each block of partitioned indexes. The + // last call to Finish() that returns Status::OK() populates index_blocks with + // the 2nd level index content. + virtual Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) = 0; + + // Get the size for index block. Must be called after ::Finish. 
+ virtual size_t IndexSize() const = 0; + + virtual bool seperator_is_key_plus_seq() { return true; } + + protected: + const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; +}; + +// This index builder builds space-efficient index block. +// +// Optimizations: +// 1. Made block's `block_restart_interval` to be 1, which will avoid linear +// search when doing index lookup (can be disabled by setting +// index_block_restart_interval). +// 2. Shorten the key length for index block. Other than honestly using the +// last key in the data block as the index key, we instead find a shortest +// substitute key that serves the same function. +class ShortenedIndexBuilder : public IndexBuilder { + public: + explicit ShortenedIndexBuilder( + const InternalKeyComparator* comparator, + const int index_block_restart_interval, const uint32_t format_version, + const bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) + : IndexBuilder(comparator), + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), + shortening_mode_(shortening_mode) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } + + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + if (first_key_in_next_block != nullptr) { + if (shortening_mode_ != + BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { + FindShortestInternalKeySeparator(*comparator_->user_comparator(), + last_key_in_current_block, + *first_key_in_next_block); + } + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } + } else { + if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor) { + FindShortInternalKeySuccessor(*comparator_->user_comparator(), + last_key_in_current_block); + } + } + auto sep = Slice(*last_key_in_current_block); + + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use delta-encoded slice. 
+ } + last_encoded_handle_ = block_handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); + } + + current_block_first_internal_key_.clear(); + } + + using IndexBuilder::Finish; + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& /*last_partition_block_handle*/) override { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + index_size_ = index_blocks->index_block_contents.size(); + return Status::OK(); + } + + virtual size_t IndexSize() const override { return index_size_; } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + // Changes *key to a short string >= *key. + // + static void FindShortestInternalKeySeparator(const Comparator& comparator, + std::string* start, + const Slice& limit); + + static void FindShortInternalKeySuccessor(const Comparator& comparator, + std::string* key); + + friend class PartitionedIndexBuilder; + + private: + BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; + bool seperator_is_key_plus_seq_; + const bool include_first_key_; + BlockBasedTableOptions::IndexShorteningMode shortening_mode_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; +}; + +// HashIndexBuilder contains a binary-searchable primary index and the +// metadata for secondary hash index construction. +// The metadata for hash index consists two parts: +// - a metablock that compactly contains a sequence of prefixes. All prefixes +// are stored consectively without any metadata (like, prefix sizes) being +// stored, which is kept in the other metablock. +// - a metablock contains the metadata of the prefixes, including prefix size, +// restart index and number of block it spans. The format looks like: +// +// +-----------------+---------------------------+---------------------+ +// <=prefix 1 +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// <=prefix 2 +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// | | +// | .... | +// | | +// +-----------------+---------------------------+---------------------+ +// <=prefix n +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// +// The reason of separating these two metablocks is to enable the efficiently +// reuse the first metablock during hash index construction without unnecessary +// data copy or small heap allocations for prefixes. 
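A standalone sketch of the two metablocks described above, in the spirit of HashIndexBuilder::FlushPendingPrefix in the class that follows: one buffer holds the raw prefixes back to back, the other holds one (length, restart index, block count) triple per prefix. Varint32 encoding is used here to match the builder; treat the exact byte layout as illustrative rather than authoritative.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>(v | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

struct PrefixEntry {
  std::string prefix;
  uint32_t first_restart_index;  // index entry where this prefix starts
  uint32_t num_blocks;           // how many consecutive entries it spans
};

int main() {
  std::vector<PrefixEntry> entries = {
      {"app", 0, 2},  // "app..." keys span index entries 0 and 1
      {"box", 2, 1},
      {"car", 3, 3},
  };

  std::string prefix_block;       // prefixes metablock payload
  std::string prefix_meta_block;  // prefix metadata metablock payload
  for (const auto& e : entries) {
    prefix_block.append(e.prefix);
    PutVarint32(&prefix_meta_block, static_cast<uint32_t>(e.prefix.size()));
    PutVarint32(&prefix_meta_block, e.first_restart_index);
    PutVarint32(&prefix_meta_block, e.num_blocks);
  }

  std::cout << "prefixes: " << prefix_block << "\n";            // appboxcar
  std::cout << "metadata bytes: " << prefix_meta_block.size()   // 9 (3 per entry)
            << "\n";
  return 0;
}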
+class HashIndexBuilder : public IndexBuilder { + public: + explicit HashIndexBuilder( + const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, int format_version, + bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode) + : IndexBuilder(comparator), + primary_index_builder_(comparator, index_block_restart_interval, + format_version, use_value_delta_encoding, + shortening_mode, /* include_first_key */ false), + hash_key_extractor_(hash_key_extractor) {} + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + ++current_restart_index_; + primary_index_builder_.AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + } + + virtual void OnKeyAdded(const Slice& key) override { + auto key_prefix = hash_key_extractor_->Transform(key); + bool is_first_entry = pending_block_num_ == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix_ != key_prefix) { + if (!is_first_entry) { + FlushPendingPrefix(); + } + + // need a hard copy otherwise the underlying data changes all the time. + // TODO(kailiu) std::to_string() is expensive. We may speed up can avoid + // data copy. + pending_entry_prefix_ = key_prefix.ToString(); + pending_block_num_ = 1; + pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); + } else { + // entry number increments when keys share the prefix reside in + // different data blocks. + auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; + assert(last_restart_index <= current_restart_index_); + if (last_restart_index != current_restart_index_) { + ++pending_block_num_; + } + } + } + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { + if (pending_block_num_ != 0) { + FlushPendingPrefix(); + } + Status s = primary_index_builder_.Finish(index_blocks, + last_partition_block_handle); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesBlock.c_str(), prefix_block_}); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + return s; + } + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + + prefix_meta_block_.size(); + } + + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + + private: + void FlushPendingPrefix() { + prefix_block_.append(pending_entry_prefix_.data(), + pending_entry_prefix_.size()); + PutVarint32Varint32Varint32( + &prefix_meta_block_, + static_cast<uint32_t>(pending_entry_prefix_.size()), + pending_entry_index_, pending_block_num_); + } + + ShortenedIndexBuilder primary_index_builder_; + const SliceTransform* hash_key_extractor_; + + // stores a sequence of prefixes + std::string prefix_block_; + // stores the metadata of prefixes + std::string prefix_meta_block_; + + // The following 3 variables keeps unflushed prefix and its metadata. + // The details of block_num and entry_index can be found in + // "block_hash_index.{h,cc}" + uint32_t pending_block_num_ = 0; + uint32_t pending_entry_index_ = 0; + std::string pending_entry_prefix_; + + uint64_t current_restart_index_ = 0; +}; + +/** + * IndexBuilder for two-level indexing. 
Internally it creates a new index for + * each partition and Finish then in order when Finish is called on it + * continiously until Status::OK() is returned. + * + * The format on the disk would be I I I I I I IP where I is block containing a + * partition of indexes built using ShortenedIndexBuilder and IP is a block + * containing a secondary index on the partitions, built using + * ShortenedIndexBuilder. + */ +class PartitionedIndexBuilder : public IndexBuilder { + public: + static PartitionedIndexBuilder* CreateIndexBuilder( + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); + + virtual ~PartitionedIndexBuilder(); + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; + + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } + size_t NumPartitions() const; + + inline bool ShouldCutFilterBlock() { + // Current policy is to align the partitions of index and filters + if (cut_filter_block) { + cut_filter_block = false; + return true; + } + return false; + } + + std::string& GetPartitionKey() { return sub_index_last_key_; } + + // Called when an external entity (such as filter partition builder) request + // cutting the next partition + void RequestPartitionCut(); + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + + void MakeNewSubIndexBuilder(); + + struct Entry { + std::string key; + std::unique_ptr<ShortenedIndexBuilder> value; + }; + std::list<Entry> entries_; // list of partitioned indexes and their keys + BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys + // the active partition index builder + ShortenedIndexBuilder* sub_index_builder_; + // the last key in the active partition index builder + std::string sub_index_last_key_; + std::unique_ptr<FlushBlockPolicy> flush_policy_; + // true if Finish is called once but not complete yet. + bool finishing_indexes = false; + const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; + // true if an external entity (such as filter partition builder) request + // cutting the next partition + bool partition_cut_requested_ = true; + // true if it should cut the next filter partition block + bool cut_filter_block = false; + BlockHandle last_encoded_handle_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_reader_common.cc b/src/rocksdb/table/block_based/index_reader_common.cc new file mode 100644 index 000000000..6584586c9 --- /dev/null +++ b/src/rocksdb/table/block_based/index_reader_common.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* wait_for_cache */ true, /* async_read */ false); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, Env::IOPriority rate_limiter_priority, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + read_options.rate_limiter_priority = rate_limiter_priority; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_reader_common.h b/src/rocksdb/table/block_based/index_reader_common.h new file mode 100644 index 000000000..5627b0eeb --- /dev/null +++ b/src/rocksdb/table/block_based/index_reader_common.h @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
+class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, Env::IOPriority rate_limiter_priority, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry<Block> index_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/mock_block_based_table.h b/src/rocksdb/table/block_based/mock_block_based_table.h new file mode 100644 index 000000000..13f3dfaee --- /dev/null +++ b/src/rocksdb/table/block_based/mock_block_based_table.h @@ -0,0 +1,62 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
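A standalone sketch of the pattern GetOrReadIndexBlock implements above: if the reader owns a pinned copy of the index block, hand out an unowned reference to it; otherwise go through the cache-aware read path, which may refuse with a cache-only tier when no_io is requested. All types here are illustrative stand-ins, not RocksDB classes.

#include <iostream>
#include <memory>
#include <string>

struct ToyBlock {
  std::string contents;
};

class ToyIndexReader {
 public:
  explicit ToyIndexReader(std::shared_ptr<ToyBlock> pinned)
      : pinned_(std::move(pinned)) {}

  // Returns nullptr only when the block is neither pinned nor cached and
  // no_io forbids touching the file (the kBlockCacheTier-style refusal).
  std::shared_ptr<ToyBlock> GetOrRead(bool no_io) const {
    if (pinned_) return pinned_;  // fast path: reader owns a resident copy
    if (cached_) return cached_;  // "block cache" hit
    if (no_io) return nullptr;
    cached_ = std::make_shared<ToyBlock>(ToyBlock{"index read from file"});
    return cached_;
  }

 private:
  std::shared_ptr<ToyBlock> pinned_;          // set when prefetch+pin was used
  mutable std::shared_ptr<ToyBlock> cached_;  // stand-in for the block cache
};

int main() {
  ToyIndexReader reader(nullptr);  // nothing pinned at "table open"
  std::cout << (reader.GetOrRead(/*no_io=*/true) ? "hit" : "miss") << "\n";   // miss
  std::cout << (reader.GetOrRead(/*no_io=*/false) ? "hit" : "miss") << "\n";  // hit
  std::cout << (reader.GetOrRead(/*no_io=*/true) ? "hit" : "miss") << "\n";   // hit
  return 0;
}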
+#pragma once + +#include <memory> + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + + explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy) + : MockBlockBasedTableTester( + std::shared_ptr<const FilterPolicy>(filter_policy)){}; + + explicit MockBlockBasedTableTester( + std::shared_ptr<const FilterPolicy> filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy = std::move(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + 12345 /*file_size*/, kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + context.info_log = ioptions_.logger; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.cc b/src/rocksdb/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 000000000..9184a48d2 --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" + +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.h b/src/rocksdb/table/block_based/parsed_full_filter_block.h new file mode 100644 index 000000000..95d7b5208 --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
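A standalone sketch of the partition sizing fallback used by the PartitionedFilterBlockBuilder constructor further below: if the configured partition size is too small to hold even one key, probe geometrically larger sizes (+25% per step) until the bits builder reports room for at least one entry, and give up near 100000 bytes with a one-key-per-byte guess. The ApproximateNumEntries cost model here (64 bytes per entry plus a fixed 512-byte overhead) is invented purely for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Pretend each filter entry costs 64 bytes on top of a 512-byte minimum.
uint32_t ApproximateNumEntries(uint32_t bytes) {
  return bytes > 512 ? (bytes - 512) / 64 : 0;
}

uint32_t KeysPerPartition(uint32_t partition_size) {
  uint32_t keys = ApproximateNumEntries(partition_size);
  if (keys >= 1) return keys;
  uint32_t larger = std::max(partition_size + 4, uint32_t{16});
  for (;;) {
    keys = ApproximateNumEntries(larger);
    if (keys >= 1) return keys;
    larger += larger / 4;                        // grow by 25% and retry
    if (larger > 100000) return partition_size;  // last-resort 1 key/byte guess
  }
}

int main() {
  std::cout << KeysPerPartition(8192) << "\n";  // plenty of room: 120
  std::cout << KeysPerPartition(100) << "\n";   // probes upward until 1 fits
  return 0;
}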
+ +#pragma once + +#include <memory> + +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. +class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + const Slice GetBlockContentsData() const { return block_contents_.data; } + + private: + BlockContents block_contents_; + std::unique_ptr<FilterBitsReader> filter_bits_reader_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.cc b/src/rocksdb/table/block_based/partitioned_filter_block.cc new file mode 100644 index 000000000..af30925b7 --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,561 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include <utility> + +#include "block_type.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast<uint32_t>( + filter_bits_builder_->ApproximateNumEntries(partition_size)); + if (keys_per_partition_ < 1) { + // partition_size (minus buffer, ~10%) might be smaller than minimum + // filter size, sometimes based on cache line size. Try to find that + // minimum size without CalculateSpace (not necessarily available). + uint32_t larger = std::max(partition_size + 4, uint32_t{16}); + for (;;) { + keys_per_partition_ = static_cast<uint32_t>( + filter_bits_builder_->ApproximateNumEntries(larger)); + if (keys_per_partition_ >= 1) { + break; + } + larger += larger / 4; + if (larger > 100000) { + // might be a broken implementation. substitute something reasonable: + // 1 key / byte. 
+ keys_per_partition_ = partition_size; + break; + } + } + } +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() { + partitioned_filters_construction_status_.PermitUncheckedError(); +} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { + // Use == to send the request only once + if (keys_added_to_partition_ == keys_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We keep + // requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. + const bool maybe_add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } + } + + total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); + std::unique_ptr<const char[]> filter_data; + Status filter_construction_status = Status::OK(); + Slice filter = + filter_bits_builder_->Finish(&filter_data, &filter_construction_status); + if (filter_construction_status.ok()) { + filter_construction_status = filter_bits_builder_->MaybePostVerify(filter); + } + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, std::move(filter_data), filter}); + if (!filter_construction_status.ok() && + partitioned_filters_construction_status_.ok()) { + partitioned_filters_construction_status_ = filter_construction_status; + } + keys_added_to_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + FullFilterBlockBuilder::AddKey(key); + keys_added_to_partition_++; +} + +size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr<const char[]>* filter_data) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_filter_entry_key), handle_encoding, + &handle_delta_encoding_slice); + } + } else { + MaybeCutAFilterBlock(nullptr); + } + + if (!partitioned_filters_construction_status_.ok()) { + *status = partitioned_filters_construction_status_; + return Slice(); + } + + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + 
*status = Status::OK(); + last_filter_data.reset(); + if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + + last_filter_entry_key = filters.front().key; + Slice filter = filters.front().filter; + last_filter_data = std::move(filters.front().filter_data); + if (filter_data != nullptr) { + *filter_data = std::move(last_filter_data); + } + filters.pop_front(); + return filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, CachableEntry<Block>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<Block> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock( + table, prefetch_buffer, ro, use_cache, nullptr /* get_context */, + lookup_context, &filter_block, BlockType::kFilterPartitionIndex); + if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + assert(const_ikey_ptr != nullptr); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, no_io, const_ikey_ptr, get_context, lookup_context, + rate_limiter_priority, &FullFilterBlockReader::KeyMayMatch); +} + +void PartitionedFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + if (!whole_key_filtering()) { + return; // Any/all may match + } + + MayMatch(range, nullptr, no_io, lookup_context, rate_limiter_priority, + &FullFilterBlockReader::KeysMayMatch2); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + assert(const_ikey_ptr != nullptr); + return MayMatch(prefix, no_io, const_ikey_ptr, get_context, lookup_context, + rate_limiter_priority, + &FullFilterBlockReader::PrefixMayMatch); +} + +void PartitionedFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + const bool no_io, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) { + assert(prefix_extractor); + MayMatch(range, prefix_extractor, no_io, lookup_context, + rate_limiter_priority, 
&FullFilterBlockReader::PrefixesMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry<Block>& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator->user_comparator(), + table()->get_rep()->get_global_seqno(BlockType::kFilterPartitionIndex), + &iter, kNullStats, true /* total_order_seek */, + false /* have_first_key */, index_key_includes_seq(), + index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + CachableEntry<ParsedFullFilterBlock>* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options; + read_options.rate_limiter_priority = rate_limiter_priority; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true, /* async_read */ false); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterFunction filter_function) const { + CachableEntry<Block> filter_block; + Status s = GetOrReadFilterBlock( + no_io, get_context, lookup_context, &filter_block, + BlockType::kFilterPartitionIndex, rate_limiter_priority); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry<ParsedFullFilterBlock> filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + rate_limiter_priority, &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)(slice, no_io, 
const_ikey_ptr, + get_context, lookup_context, + rate_limiter_priority); +} + +void PartitionedFilterBlockReader::MayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterManyFunction filter_function) const { + CachableEntry<Block> filter_block; + Status s = GetOrReadFilterBlock( + no_io, range->begin()->get_context, lookup_context, &filter_block, + BlockType::kFilterPartitionIndex, rate_limiter_priority); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return; // Any/all may match + } + + auto start_iter_same_handle = range->begin(); + BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle(); + + // For all keys mapping to same partition (must be adjacent in sorted order) + // share block cache lookup and use full filter multiget on the partition + // filter. + for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) { + // TODO: re-use one top-level index iterator + BlockHandle this_filter_handle = + GetFilterPartitionHandle(filter_block, iter->ikey); + if (!prev_filter_handle.IsNull() && + this_filter_handle != prev_filter_handle) { + MultiGetRange subrange(*range, start_iter_same_handle, iter); + MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, + lookup_context, rate_limiter_priority, filter_function); + range->AddSkipsFrom(subrange); + start_iter_same_handle = iter; + } + if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range + // Not reachable with current behavior of GetFilterPartitionHandle + assert(false); + range->SkipKey(iter); + prev_filter_handle = BlockHandle::NullBlockHandle(); + } else { + prev_filter_handle = this_filter_handle; + } + } + if (!prev_filter_handle.IsNull()) { + MultiGetRange subrange(*range, start_iter_same_handle, range->end()); + MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, + lookup_context, rate_limiter_priority, filter_function); + range->AddSkipsFrom(subrange); + } +} + +void PartitionedFilterBlockReader::MayMatchPartition( + MultiGetRange* range, const SliceTransform* prefix_extractor, + BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterManyFunction filter_function) const { + CachableEntry<ParsedFullFilterBlock> filter_partition_block; + Status s = GetFilterPartitionBlock( + nullptr /* prefetch_buffer */, filter_handle, no_io, + range->begin()->get_context, lookup_context, rate_limiter_priority, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + (filter_partition.*filter_function)(range, prefix_extractor, no_io, + lookup_context, rate_limiter_priority); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, + bool pin) { + 
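  // In outline: read the top-level filter index block, compute the contiguous
  // byte range spanning all filter partitions (first partition offset through
  // last partition offset + size + block trailer), prefetch that range with a
  // single read, then load each partition into the block cache. When `pin` is
  // true, cached partitions are also kept in filter_map_ so later lookups can
  // serve them without another block cache lookup.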
assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry<Block> filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block, + BlockType::kFilterPartitionIndex, + ro.rate_limiter_priority); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return s; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator->user_comparator(), + rep->get_global_seqno(BlockType::kFilterPartitionIndex), &biter, + kNullStats, true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value().handle; + uint64_t last_off = + handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } + + // After prefetch, read the partitions one by one + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { + handle = biter.value().handle; + + CachableEntry<ParsedFullFilterBlock> block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */, + false); + if (!s.ok()) { + return s; + } + assert(s.ok() || block.GetValue() == nullptr); + + if (block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } + } + } + } + return biter.status(); +} + +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.h b/src/rocksdb/table/block_based/partitioned_filter_block.h new file mode 100644 index 
000000000..955b50739 --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.h @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <deque> +#include <list> +#include <string> +#include <unordered_map> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/index_builder.h" +#include "util/autovector.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { +class InternalKeyComparator; + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + void Add(const Slice& key) override; + size_t EstimateEntriesAdded() override; + + virtual Slice Finish( + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr<const char[]>* filter_data = nullptr) override; + + virtual void ResetFilterBitsBuilder() override { + // Previously constructed partitioned filters by + // this to-be-reset FiterBitsBuilder can also be + // cleared + filters.clear(); + FullFilterBlockBuilder::ResetFilterBitsBuilder(); + } + + // For PartitionFilter, optional post-verifing the filter is done + // as part of PartitionFilterBlockBuilder::Finish + // to avoid implementation complexity of doing it elsewhere. + // Therefore we are skipping it in here. + virtual Status MaybePostVerifyFilter( + const Slice& /* filter_content */) override { + return Status::OK(); + } + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + std::unique_ptr<const char[]> filter_data; + Slice filter; + }; + std::deque<FilterEntry> filters; // list of partitioned filters and keys used + // in building the index + + // Set to the first non-okay status if any of the filter + // partitions experiences construction error. + // If partitioned_filters_construction_status_ is non-okay, + // then the whole partitioned filters should not be used. + Status partitioned_filters_construction_status_; + std::string last_filter_entry_key; + std::unique_ptr<const char[]> last_filter_data; + std::unique_ptr<IndexBuilder> value; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when cut a filter block and Finish it + void MaybeCutAFilterBlock(const Slice* next_key); + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potentioal optimizations in future. 
If such + // optimizations did not realize we can use different number of partitions and + // eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of keys per partition + uint32_t keys_per_partition_; + // The number of keys added to the last partition so far + uint32_t keys_added_to_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> { + public: + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry<Block>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); + + bool KeyMayMatch(const Slice& key, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + void KeysMayMatch(MultiGetRange* range, const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + + bool PrefixMayMatch(const Slice& prefix, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + const bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority) override; + + size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + CachableEntry<ParsedFullFilterBlock>* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority); + bool MayMatch(const Slice& slice, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterFunction filter_function) const; + using FilterManyFunction = void (FullFilterBlockReader::*)( + MultiGetRange* range, const SliceTransform* prefix_extractor, + const bool no_io, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority); + void MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, + bool no_io, BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterManyFunction filter_function) const; + void MayMatchPartition(MultiGetRange* range, + const SliceTransform* prefix_extractor, + BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, + Env::IOPriority rate_limiter_priority, + FilterManyFunction filter_function) const; + Status CacheDependencies(const ReadOptions& ro, bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + 
bool index_value_is_full() const; + + protected: + // For partition blocks pinned in cache. Can be a subset of blocks + // in case some fail insertion on attempt to pin. + UnorderedMap<uint64_t, CachableEntry<ParsedFullFilterBlock>> filter_map_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block_test.cc b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc new file mode 100644 index 000000000..0ce50d2bc --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc @@ -0,0 +1,436 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include <map> + +#include "index_builder.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +std::map<uint64_t, std::string> blooms; + +class MockedBlockBasedTable : public BlockBasedTable { + public: + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { + // Initialize what Open normally does as much as necessary for the test + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); + } +}; + +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry<Block>&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : blooms) { + const uint64_t offset = pair.first; + const std::string& bloom = pair.second; + + assert(t); + assert(t->get_rep()); + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock( + t->get_rep()->table_options.filter_policy.get(), + BlockContents(Slice(bloom))), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + filter_map_[offset] = std::move(block); + } + } +}; + +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + Options options_; + ImmutableOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + std::shared_ptr<Cache> cache_; + int bits_per_key_; + + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator), + bits_per_key_(10) { + table_options_.filter_policy.reset( + NewBloomFilterPolicy(bits_per_key_, false)); + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; + } + + ~PartitionedFilterBlockTest() override {} + + const std::string keys[4] = {"afoo", "bar", "box", "hello"}; + const std::string missing_keys[2] = {"missing", "other"}; + + uint64_t MaxIndexSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + uint64_t max_key_size = 0; + for (int i = 1; i < num_keys; i++) { + max_key_size = + std::max(max_key_size, static_cast<uint64_t>(keys[i].size())); + } + uint64_t max_index_size = num_keys * 
(max_key_size + 8 /*handle*/); + return max_index_size; + } + + uint64_t MaxFilterSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + // General, rough over-approximation + return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); + } + + uint64_t last_offset = 10; + BlockHandle Write(const Slice& slice) { + BlockHandle bh(last_offset + 1, slice.size()); + blooms[bh.offset()] = slice.ToString(); + last_offset += bh.size(); + return bh; + } + + PartitionedIndexBuilder* NewIndexBuilder() { + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp_, !kValueDeltaEncoded, table_options_); + } + + PartitionedFilterBlockBuilder* NewBuilder( + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { + assert(table_options_.block_size_deviation <= 100); + auto partition_size = + static_cast<uint32_t>(((table_options_.metadata_block_size * + (100 - table_options_.block_size_deviation)) + + 99) / + 100); + partition_size = std::max(partition_size, static_cast<uint32_t>(1)); + const bool kValueDeltaEncoded = true; + return new PartitionedFilterBlockBuilder( + prefix_extractor, table_options_.whole_key_filtering, + BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_)), + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); + } + + PartitionedFilterBlockReader* NewReader( + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { + BlockHandle bh; + Status status; + Slice slice; + std::unique_ptr<const char[]> filter_data; + do { + slice = builder->Finish(bh, &status, &filter_data); + bh = Write(slice); + } while (status.IsIncomplete()); + + constexpr bool skip_filters = false; + constexpr uint64_t file_size = 12345; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, file_size, level, + immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry<Block> block( + new Block(std::move(contents), 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); + return reader; + } + + void VerifyReader(PartitionedFilterBlockBuilder* builder, + PartitionedIndexBuilder* pib, bool empty = false) { + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder, pib)); + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + // Querying added keys + const bool no_io = true; + for (auto key : keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + } + { + // querying a key twice + auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(keys[0], !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + } + // querying missing keys + for (auto key : missing_keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + if (empty) { + ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice, + /*get_context=*/nullptr, + 
/*lookup_context=*/nullptr, + rate_limiter_priority)); + } else { + // assuming a good hash function + ASSERT_FALSE(reader->KeyMayMatch(key, !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + } + } + } + + int TestBlockPerKey() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + return CountNumOfIndexPartitions(pib.get()); + } + + void TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor)); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get(), prefix_extractor); + } + + void TestBlockPerAllKeys() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + } + + void CutABlock(PartitionedIndexBuilder* builder, + const std::string& user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + builder->AddIndexEntry(&key, nullptr, dont_care_block_handle); + } + + void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, + const std::string& next_user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + std::string next_key = std::string( + *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + Slice slice = Slice(next_key.data(), next_key.size()); + builder->AddIndexEntry(&key, &slice, dont_care_block_handle); + } + + int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) { + IndexBuilder::IndexBlocks dont_care_ib; + BlockHandle dont_care_bh(10, 10); + Status s; + int cnt = 0; + do { + s = builder->Finish(&dont_care_ib, dont_care_bh); + cnt++; + } while (s.IsIncomplete()); + return cnt - 1; // 1 is 2nd level index + } +}; + +// Format versions potentially intersting to partitioning +INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest, + testing::ValuesIn(std::set<uint32_t>{ + 2, 3, 4, test::kDefaultFormatVersion, + kLatestFormatVersion})); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get())); + const bool empty = true; + VerifyReader(builder.get(), pib.get(), 
empty); +} + +TEST_P(PartitionedFilterBlockTest, OneBlock) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerAllKeys(); + } +} + +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerTwoKeys(); + } +} + +// This reproduces the bug that a prefix is the same among multiple consecutive +// blocks but the bug would add it only to the first block. +TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(1)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), + /*no_io=*/false, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + Env::IO_TOTAL)); + } + // Non-existent keys but with the same prefix + const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + for (auto key : pnonkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), + /*no_io=*/false, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + Env::IO_TOTAL)); + } +} + +// This reproduces the bug in format_version=3 that the seeking the prefix will +// lead us to the partition before the one that has filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(2)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3, + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist. 
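  // Setup sketch: metadata_block_size = 1 forces a partition cut after each
  // key, so every pkeys[i] lands in its own filter partition. The workaround
  // in MaybeCutAFilterBlock adds the *next* key's prefix to the partition
  // being finished, so a prefix seek that lands on the preceding partition
  // (the format_version=3 behavior described above) still finds the prefix.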
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix, + /*no_io=*/false, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, + rate_limiter_priority)); + } +} + +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerKey(); + } +} + +TEST_P(PartitionedFilterBlockTest, PartitionCount) { + int num_keys = sizeof(keys) / sizeof(*keys); + table_options_.metadata_block_size = + std::max(MaxIndexSize(), MaxFilterSize()); + int partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, 1); + // A low number ensures cutting a block after each key + table_options_.metadata_block_size = 1; + partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/partitioned_index_iterator.cc b/src/rocksdb/table/block_based/partitioned_index_iterator.cc new file mode 100644 index 000000000..b9bc2155a --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_index_iterator.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void PartitionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void PartitionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); } + +void PartitionedIndexIterator::SeekImpl(const Slice* target) { + SavePrevIndexValue(); + + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + + InitPartitionedIndexBlock(); + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + + // We could check upper bound here, but that would be too complicated + // and checking index upper bound is less useful than for data blocks. + + if (target) { + assert(!Valid() || (table_->get_rep()->index_key_includes_seq + ? 
(icomp_.Compare(*target, key()) <= 0) + : (user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0))); + } +} + +void PartitionedIndexIterator::SeekToLast() { + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); +} + +void PartitionedIndexIterator::Next() { + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); +} + +void PartitionedIndexIterator::Prev() { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + + FindKeyBackward(); +} + +void PartitionedIndexIterator::InitPartitionedIndexBlock() { + BlockHandle partitioned_index_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + partitioned_index_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetPartitionedIndexIter(); + } + auto* rep = table_->get_rep(); + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + block_prefetcher_.PrefetchIfNeeded( + rep, partitioned_index_handle, read_options_.readahead_size, + is_for_compaction, /*no_sequential_checking=*/false, + read_options_.rate_limiter_priority); + Status s; + table_->NewDataBlockIterator<IndexBlockIter>( + read_options_, partitioned_index_handle, &block_iter_, + BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + block_iter_points_to_real_block_ = true; + // We could check upper bound here but it is complicated to reason about + // upper bound in index iterator. On the other than, in large scans, index + // iterators are moved much less frequently compared to data blocks. So + // the upper bound check is skipped for simplicity. + } +} + +void PartitionedIndexIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void PartitionedIndexIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". 
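  // Walk the top-level index forward, opening each index partition in turn,
  // until a non-empty partition is found or either iterator reports an error
  // or runs out of entries.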
+ do { + if (!block_iter_.status().ok()) { + return; + } + ResetPartitionedIndexIter(); + index_iter_->Next(); + + if (!index_iter_->Valid()) { + return; + } + + InitPartitionedIndexBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void PartitionedIndexIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetPartitionedIndexIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_index_iterator.h b/src/rocksdb/table/block_based/partitioned_index_iterator.h new file mode 100644 index 000000000..6412fe239 --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_index_iterator.h @@ -0,0 +1,160 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterator that iterates over partitioned index. +// Some upper and lower bound tricks played in block based table iterators +// could be played here, but it's too complicated to reason about index +// keys with upper or lower bound, so we skip it for simplicity. +class PartitionedIndexIterator : public InternalIteratorBase<IndexValue> { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + PartitionedIndexIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter, + TableReaderCaller caller, size_t compaction_readahead_size = 0) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), +#ifndef NDEBUG + icomp_(icomp), +#endif + user_comparator_(icomp.user_comparator()), + block_iter_points_to_real_block_(false), + lookup_context_(caller), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size) { + } + + ~PartitionedIndexIterator() override {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice&) override { + // Shouldn't be called. 
+ assert(false); + } + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult*) override { + assert(false); + return false; + } + void Prev() override; + bool Valid() const override { + return block_iter_points_to_real_block_ && block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + inline IterBoundCheck UpperBoundCheckResult() override { + // Shouldn't be called. + assert(false); + return IterBoundCheck::kUnknown; + } + void SetPinnedItersMgr(PinnedIteratorsManager*) override { + // Shouldn't be called. + assert(false); + } + bool IsKeyPinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + bool IsValuePinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + + void ResetPartitionedIndexIter() { + if (block_iter_points_to_real_block_) { + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_; + + private: + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + const BlockBasedTable* table_; + const ReadOptions read_options_; +#ifndef NDEBUG + const InternalKeyComparator& icomp_; +#endif + UserComparatorWrapper user_comparator_; + IndexBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max(); + BlockCacheLookupContext lookup_context_; + BlockPrefetcher block_prefetcher_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitPartitionedIndexBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_index_reader.cc b/src/rocksdb/table/block_based/partitioned_index_reader.cc new file mode 100644 index 000000000..017ea4a3a --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_index_reader.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_reader.h" + +#include "file/random_access_file_reader.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +Status PartitionIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset(new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, + get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + const BlockBasedTable::Rep* rep = table()->rep_; + InternalIteratorBase<IndexValue>* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + ro.adaptive_readahead = read_options.adaptive_readahead; + ro.async_io = read_options.async_io; + ro.rate_limiter_priority = read_options.rate_limiter_priority; + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
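    // With no pinned partitions, build a fresh iterator over the top-level
    // index block and wrap it in a PartitionedIndexIterator, which reads each
    // index partition on demand (through the block cache where possible).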
+ std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter( + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + + it = new PartitionedIndexIterator( + table(), ro, *internal_comparator(), std::move(index_iter), + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. +} +Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, + bool pin) { + // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry<Block> index_block; + { + Status s = GetOrReadIndexBlock(false /* no_io */, ro.rate_limiter_priority, + nullptr /* get_context */, &lookup_context, + &index_block); + if (!s.ok()) { + return s; + } + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return biter.status(); + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. 
+ return biter.status(); + } + handle = biter.value().handle; + uint64_t last_off = + handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } + } + + // For saving "all or nothing" to partition_map_ + UnorderedMap<uint64_t, CachableEntry<Block>> map_in_progress; + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + size_t partition_count = 0; + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry<Block> block; + ++partition_count; + // TODO: Support counter batch update for partitioned index and + // filter blocks + Status s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, + /*async_read=*/false); + + if (!s.ok()) { + return s; + } + if (block.GetValue() != nullptr) { + // Might need to "pin" some mmap-read blocks (GetOwnValue) if some + // partitions are successfully compressed (cached) and some are not + // compressed (mmap eligible) + if (block.IsCached() || block.GetOwnValue()) { + if (pin) { + map_in_progress[handle.offset()] = std::move(block); + } + } + } + } + Status s = biter.status(); + // Save (pin) them only if everything checks out + if (map_in_progress.size() == partition_count && s.ok()) { + std::swap(partition_map_, map_in_progress); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_index_reader.h b/src/rocksdb/table/block_based/partitioned_index_reader.h new file mode 100644 index 000000000..58a7877ab --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_index_reader.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
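  // Usage sketch (illustrative; the local names below are hypothetical):
  //
  //   std::unique_ptr<IndexReader> index_reader;
  //   Status s = PartitionIndexReader::Create(table, read_options,
  //                                           prefetch_buffer, use_cache,
  //                                           prefetch, pin, lookup_context,
  //                                           &index_reader);
  //   if (s.ok()) {
  //     // index_reader can now hand out two-level iterators via NewIterator()
  //   }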
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader); + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + Status CacheDependencies(const ReadOptions& ro, bool pin) override; + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + // For partition blocks pinned in cache. This is expected to be "all or + // none" so that !partition_map_.empty() can use an iterator expecting + // all partitions to be saved here. + UnorderedMap<uint64_t, CachableEntry<Block>> partition_map_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/reader_common.cc b/src/rocksdb/table/block_based/reader_common.cc new file mode 100644 index 000000000..0ff43e9b4 --- /dev/null +++ b/src/rocksdb/table/block_based/reader_common.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/reader_common.h" + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/table.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, true /* erase_if_last_ref */); +} + +// WART: this is specific to block-based table +Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, const std::string& file_name, + uint64_t offset) { + PERF_TIMER_GUARD(block_checksum_time); + // After block_size bytes is compression type (1 byte), which is part of + // the checksummed section. + size_t len = block_size + 1; + // And then the stored checksum value (4 bytes). 
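  // Layout being verified (per the comments above; see also format.h):
  //
  //   [ block contents: block_size bytes | compression type: 1 byte | checksum: 4 bytes ]
  //
  // The checksum covers the first len = block_size + 1 bytes and is itself
  // stored (fixed32) at data + len.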
+ uint32_t stored = DecodeFixed32(data + len); + + uint32_t computed = ComputeBuiltinChecksum(type, data, len); + if (stored == computed) { + return Status::OK(); + } else { + // Unmask for people who might look for reference crc value + if (type == kCRC32c) { + stored = crc32c::Unmask(stored); + computed = crc32c::Unmask(computed); + } + return Status::Corruption( + "block checksum mismatch: stored = " + std::to_string(stored) + + ", computed = " + std::to_string(computed) + + ", type = " + std::to_string(type) + " in " + file_name + " offset " + + std::to_string(offset) + " size " + std::to_string(block_size)); + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/reader_common.h b/src/rocksdb/table/block_based/reader_common.h new file mode 100644 index 000000000..5bb199f28 --- /dev/null +++ b/src/rocksdb/table/block_based/reader_common.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +// Release the cached entry and decrement its ref count. +extern void ForceReleaseCachedEntry(void* arg, void* h); + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Assumes block has a trailer as in format.h. file_name and offset provided +// for generating a diagnostic message in returned status. +extern Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, + const std::string& file_name, + uint64_t offset); +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.cc b/src/rocksdb/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 000000000..dc9a47ec7 --- /dev/null +++ b/src/rocksdb/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "table/block_based/uncompression_dict_reader.h" + +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +Status UncompressionDictReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(uncompression_dict_reader); + + CachableEntry<UncompressionDict> uncompression_dict; + if (prefetch || !use_cache) { + const Status s = ReadUncompressionDictionary( + table, prefetch_buffer, ro, use_cache, nullptr /* get_context */, + lookup_context, &uncompression_dict); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + uncompression_dict.Reset(); + } + } + + uncompression_dict_reader->reset( + new UncompressionDictReader(table, std::move(uncompression_dict))); + + return Status::OK(); +} + +Status UncompressionDictReader::ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<UncompressionDict>* uncompression_dict) { + // TODO: add perf counter for compression dictionary read time + + assert(table); + assert(uncompression_dict); + assert(uncompression_dict->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + assert(!rep->compression_dict_handle.IsNull()); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->compression_dict_handle, + UncompressionDict::GetEmptyDict(), uncompression_dict, + BlockType::kCompressionDictionary, get_context, lookup_context, + /* for_compaction */ false, use_cache, /* wait_for_cache */ true, + /* async_read */ false); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.logger, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } + + return s; +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + CachableEntry<UncompressionDict>* uncompression_dict) const { + assert(uncompression_dict); + + if (!uncompression_dict_.IsEmpty()) { + uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + read_options.verify_checksums = verify_checksums; + + return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, + cache_dictionary_blocks(), get_context, + lookup_context, uncompression_dict); +} + +size_t UncompressionDictReader::ApproximateMemoryUsage() const { + assert(!uncompression_dict_.GetOwnValue() || + uncompression_dict_.GetValue() != nullptr); + size_t usage = uncompression_dict_.GetOwnValue() + ? 
uncompression_dict_.GetValue()->ApproximateMemoryUsage() + : 0; + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + return usage; +} + +bool UncompressionDictReader::cache_dictionary_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.h b/src/rocksdb/table/block_based/uncompression_dict_reader.h new file mode 100644 index 000000000..416d25e2d --- /dev/null +++ b/src/rocksdb/table/block_based/uncompression_dict_reader.h @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include <cassert> + +#include "table/block_based/cachable_entry.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTable; +struct BlockCacheLookupContext; +class FilePrefetchBuffer; +class GetContext; +struct ReadOptions; +struct UncompressionDict; + +// Provides access to the uncompression dictionary regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class UncompressionDictReader { + public: + static Status Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader); + + Status GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + CachableEntry<UncompressionDict>* uncompression_dict) const; + + size_t ApproximateMemoryUsage() const; + + private: + UncompressionDictReader(const BlockBasedTable* t, + CachableEntry<UncompressionDict>&& uncompression_dict) + : table_(t), uncompression_dict_(std::move(uncompression_dict)) { + assert(table_); + } + + bool cache_dictionary_blocks() const; + + static Status ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<UncompressionDict>* uncompression_dict); + + const BlockBasedTable* table_; + CachableEntry<UncompressionDict> uncompression_dict_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_fetcher.cc b/src/rocksdb/table/block_fetcher.cc new file mode 100644 index 000000000..8df0850b3 --- /dev/null +++ b/src/rocksdb/table/block_fetcher.cc @@ -0,0 +1,399 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
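
The interplay of use_cache, prefetch, and pin in UncompressionDictReader::Create
above is easy to misread, so here is a compact restatement as a sketch. The
DictState enum and StateAfterCreate function are illustrative names only, not
part of the RocksDB API; they simply mirror the branches of Create.

#include <cassert>

// What Create() leaves behind, per flag combination (sketch).
enum class DictState {
  kHeldByReader,     // dictionary read eagerly and retained by the reader
  kWarmedCacheOnly,  // read once to warm the block cache, then released
  kReadLazily        // nothing read up front; loaded on first use
};

DictState StateAfterCreate(bool use_cache, bool prefetch, bool pin) {
  assert(!pin || prefetch);  // pinning only makes sense when prefetching
  if (!use_cache) {
    return DictState::kHeldByReader;  // reader owns its own copy
  }
  if (!prefetch) {
    return DictState::kReadLazily;  // GetOrReadUncompressionDictionary later
  }
  return pin ? DictState::kHeldByReader   // cache entry kept alive by reader
             : DictState::kWarmedCacheOnly;
}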
+ +#include "table/block_fetcher.h" + +#include <cassert> +#include <cinttypes> +#include <string> + +#include "logging/logging.h" +#include "memory/memory_allocator.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_type.h" +#include "table/block_based/reader_common.h" +#include "table/format.h" +#include "table/persistent_cache_helper.h" +#include "util/compression.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +inline void BlockFetcher::ProcessTrailerIfPresent() { + if (footer_.GetBlockTrailerSize() > 0) { + assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize); + if (read_options_.verify_checksums) { + io_status_ = status_to_io_status(VerifyBlockChecksum( + footer_.checksum_type(), slice_.data(), block_size_, + file_->file_name(), handle_.offset())); + RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); + } + compression_type_ = + BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_); + } else { + // E.g. plain table or cuckoo table + compression_type_ = kNoCompression; + } +} + +inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + Status status = PersistentCacheHelper::LookupUncompressed( + cache_options_, handle_, contents_); + if (status.ok()) { + // uncompressed page is found for the block handle + return true; + } else { + // uncompressed page is not found + if (ioptions_.logger && !status.IsNotFound()) { + assert(!status.ok()); + ROCKS_LOG_INFO(ioptions_.logger, + "Error reading from persistent cache. 
%s", + status.ToString().c_str()); + } + } + } + return false; +} + +inline bool BlockFetcher::TryGetFromPrefetchBuffer() { + if (prefetch_buffer_ != nullptr) { + IOOptions opts; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok()) { + bool read_from_prefetch_buffer = false; + if (read_options_.async_io && !for_compaction_) { + read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, + &io_s, read_options_.rate_limiter_priority); + } else { + read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, + &io_s, read_options_.rate_limiter_priority, for_compaction_); + } + if (read_from_prefetch_buffer) { + ProcessTrailerIfPresent(); + if (!io_status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast<char*>(slice_.data()); + } + } + if (!io_s.ok()) { + io_status_ = io_s; + return true; + } + } + return got_from_prefetch_buffer_; +} + +inline bool BlockFetcher::TryGetSerializedBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + std::unique_ptr<char[]> buf; + io_status_ = status_to_io_status(PersistentCacheHelper::LookupSerialized( + cache_options_, handle_, &buf, block_size_with_trailer_)); + if (io_status_.ok()) { + heap_buf_ = CacheAllocationPtr(buf.release()); + used_buf_ = heap_buf_.get(); + slice_ = Slice(heap_buf_.get(), block_size_); + ProcessTrailerIfPresent(); + return true; + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, + "Error reading from persistent cache. %s", + io_status_.ToString().c_str()); + } + } + return false; +} + +inline void BlockFetcher::PrepareBufferForBlockFromFile() { + // cache miss read from device + if ((do_uncompress_ || ioptions_.allow_mmap_reads) && + block_size_with_trailer_ < kDefaultStackBufferSize) { + // If we've got a small enough chunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + // + // `GetBlockContents()` cannot return this data as its lifetime is tied to + // this `BlockFetcher`'s lifetime. That is fine because this is only used + // in cases where we do not expect the `GetBlockContents()` result to be the + // same buffer we are assigning here. If we guess incorrectly, there will be + // a heap allocation and memcpy in `GetBlockContents()` to obtain the final + // result. Considering we are eliding a heap allocation here by using the + // stack buffer, the cost of guessing incorrectly here is one extra memcpy. + // + // When `do_uncompress_` is true, we expect the uncompression step will + // allocate heap memory for the final result. However this expectation will + // be wrong if the block turns out to already be uncompressed, which we + // won't know for sure until after reading it. + // + // When `ioptions_.allow_mmap_reads` is true, we do not expect the file + // reader to use the scratch buffer at all, but instead return a pointer + // into the mapped memory. This expectation will be wrong when using a + // file reader that does not implement mmap reads properly. 
+ used_buf_ = &stack_buf_[0]; + } else if (maybe_compressed_ && !do_uncompress_) { + compressed_buf_ = + AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_); + used_buf_ = compressed_buf_.get(); + } else { + heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_); + used_buf_ = heap_buf_.get(); + } +} + +inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { + if (io_status_.ok() && read_options_.fill_cache && + cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + PersistentCacheHelper::InsertSerialized(cache_options_, handle_, used_buf_, + block_size_with_trailer_); + } +} + +inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { + if (io_status_.ok() && !got_from_prefetch_buffer_ && + read_options_.fill_cache && cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + // insert to uncompressed cache + PersistentCacheHelper::InsertUncompressed(cache_options_, handle_, + *contents_); + } +} + +inline void BlockFetcher::CopyBufferToHeapBuf() { + assert(used_buf_ != heap_buf_.get()); + heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif +} + +inline void BlockFetcher::CopyBufferToCompressedBuf() { + assert(used_buf_ != compressed_buf_.get()); + compressed_buf_ = + AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_); + memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_compressed_buf_memcpy_++; +#endif +} + +// Entering this method means the block is not compressed or do not need to be +// uncompressed. The block can be in one of the following buffers: +// 1. prefetch buffer if prefetch is enabled and the block is prefetched before +// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block +// is not compressed +// 3. heap_buf_ if the block is not compressed +// 4. compressed_buf_ if the block is compressed +// 5. direct_io_buf_ if direct IO is enabled +// After this method, if the block is compressed, it should be in +// compressed_buf_, otherwise should be in heap_buf_. +inline void BlockFetcher::GetBlockContents() { + if (slice_.data() != used_buf_) { + // the slice content is not the buffer provided + *contents_ = BlockContents(Slice(slice_.data(), block_size_)); + } else { + // page can be either uncompressed or compressed, the buffer either stack + // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 + if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { + CopyBufferToHeapBuf(); + } else if (used_buf_ == compressed_buf_.get()) { + if (compression_type_ == kNoCompression && + memory_allocator_ != memory_allocator_compressed_) { + CopyBufferToHeapBuf(); + } else { + heap_buf_ = std::move(compressed_buf_); + } + } else if (direct_io_buf_.get() != nullptr) { + if (compression_type_ == kNoCompression) { + CopyBufferToHeapBuf(); + } else { + CopyBufferToCompressedBuf(); + heap_buf_ = std::move(compressed_buf_); + } + } + *contents_ = BlockContents(std::move(heap_buf_), block_size_); + } +#ifndef NDEBUG + contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; +#endif +} + +IOStatus BlockFetcher::ReadBlockContents() { + if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; +#endif // NDEBUG + return IOStatus::OK(); + } + if (TryGetFromPrefetchBuffer()) { + if (!io_status_.ok()) { + return io_status_; + } + } else if (!TryGetSerializedBlockFromPersistentCache()) { + IOOptions opts; + io_status_ = file_->PrepareIOOptions(read_options_, opts); + // Actual file read + if (io_status_.ok()) { + if (file_->use_direct_io()) { + PERF_TIMER_GUARD(block_read_time); + io_status_ = file_->Read( + opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr, + &direct_io_buf_, read_options_.rate_limiter_priority); + PERF_COUNTER_ADD(block_read_count, 1); + used_buf_ = const_cast<char*>(slice_.data()); + } else { + PrepareBufferForBlockFromFile(); + PERF_TIMER_GUARD(block_read_time); + io_status_ = file_->Read(opts, handle_.offset(), + block_size_with_trailer_, &slice_, used_buf_, + nullptr, read_options_.rate_limiter_priority); + PERF_COUNTER_ADD(block_read_count, 1); +#ifndef NDEBUG + if (slice_.data() == &stack_buf_[0]) { + num_stack_buf_memcpy_++; + } else if (slice_.data() == heap_buf_.get()) { + num_heap_buf_memcpy_++; + } else if (slice_.data() == compressed_buf_.get()) { + num_compressed_buf_memcpy_++; + } +#endif + } + } + + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; + + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; + + // Nothing to do here as we don't have counters for the other types. 
+ default: + break; + } + + PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); + if (!io_status_.ok()) { + return io_status_; + } + + if (slice_.size() != block_size_with_trailer_) { + return IOStatus::Corruption( + "truncated block read from " + file_->file_name() + " offset " + + std::to_string(handle_.offset()) + ", expected " + + std::to_string(block_size_with_trailer_) + " bytes, got " + + std::to_string(slice_.size())); + } + + ProcessTrailerIfPresent(); + if (io_status_.ok()) { + InsertCompressedBlockToPersistentCacheIfNeeded(); + } else { + return io_status_; + } + } + + if (do_uncompress_ && compression_type_ != kNoCompression) { + PERF_TIMER_GUARD(block_decompress_time); + // compressed page, uncompress, update cache + UncompressionContext context(compression_type_); + UncompressionInfo info(context, uncompression_dict_, compression_type_); + io_status_ = status_to_io_status(UncompressSerializedBlock( + info, slice_.data(), block_size_, contents_, footer_.format_version(), + ioptions_, memory_allocator_)); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif + compression_type_ = kNoCompression; + } else { + GetBlockContents(); + } + + InsertUncompressedBlockToPersistentCacheIfNeeded(); + + return io_status_; +} + +IOStatus BlockFetcher::ReadAsyncBlockContents() { + if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->has_trailer = footer_.GetBlockTrailerSize() > 0; +#endif // NDEBUG + return IOStatus::OK(); + } else if (!TryGetSerializedBlockFromPersistentCache()) { + assert(prefetch_buffer_ != nullptr); + if (!for_compaction_) { + IOOptions opts; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (!io_s.ok()) { + return io_s; + } + io_s = status_to_io_status(prefetch_buffer_->PrefetchAsync( + opts, file_, handle_.offset(), block_size_with_trailer_, &slice_)); + if (io_s.IsTryAgain()) { + return io_s; + } + if (io_s.ok()) { + // Data Block is already in prefetch. + got_from_prefetch_buffer_ = true; + ProcessTrailerIfPresent(); + if (!io_status_.ok()) { + return io_status_; + } + used_buf_ = const_cast<char*>(slice_.data()); + + if (do_uncompress_ && compression_type_ != kNoCompression) { + PERF_TIMER_GUARD(block_decompress_time); + // compressed page, uncompress, update cache + UncompressionContext context(compression_type_); + UncompressionInfo info(context, uncompression_dict_, + compression_type_); + io_status_ = status_to_io_status(UncompressSerializedBlock( + info, slice_.data(), block_size_, contents_, + footer_.format_version(), ioptions_, memory_allocator_)); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif + compression_type_ = kNoCompression; + } else { + GetBlockContents(); + } + InsertUncompressedBlockToPersistentCacheIfNeeded(); + return io_status_; + } + } + // Fallback to sequential reading of data blocks in case of io_s returns + // error or for_compaction_is true. + return ReadBlockContents(); + } + return io_status_; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_fetcher.h b/src/rocksdb/table/block_fetcher.h new file mode 100644 index 000000000..72adced30 --- /dev/null +++ b/src/rocksdb/table/block_fetcher.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "memory/memory_allocator.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" + +namespace ROCKSDB_NAMESPACE { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store +// either compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + +class BlockFetcher { + public: + BlockFetcher(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer /* ref retained */, + const ReadOptions& read_options, + const BlockHandle& handle /* ref retained */, + BlockContents* contents, + const ImmutableOptions& ioptions /* ref retained */, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict /* ref retained */, + const PersistentCacheOptions& cache_options /* ref retained */, + MemoryAllocator* memory_allocator = nullptr, + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) + : file_(file), + prefetch_buffer_(prefetch_buffer), + footer_(footer), + read_options_(read_options), + handle_(handle), + contents_(contents), + ioptions_(ioptions), + do_uncompress_(do_uncompress), + maybe_compressed_(maybe_compressed), + block_type_(block_type), + block_size_(static_cast<size_t>(handle_.size())), + block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()), + uncompression_dict_(uncompression_dict), + cache_options_(cache_options), + memory_allocator_(memory_allocator), + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) { + io_status_.PermitUncheckedError(); // TODO(AR) can we improve on this? 
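    // Marking the default-constructed IOStatus as checked up front keeps
    // builds that enforce status checking from asserting if this fetcher is
    // destroyed without any read ever being attempted.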
+ } + + IOStatus ReadBlockContents(); + IOStatus ReadAsyncBlockContents(); + + inline CompressionType get_compression_type() const { + return compression_type_; + } + inline size_t GetBlockSizeWithTrailer() const { + return block_size_with_trailer_; + } + +#ifndef NDEBUG + int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; } + int TEST_GetNumHeapBufMemcpy() const { return num_heap_buf_memcpy_; } + int TEST_GetNumCompressedBufMemcpy() const { + return num_compressed_buf_memcpy_; + } + +#endif + private: +#ifndef NDEBUG + int num_stack_buf_memcpy_ = 0; + int num_heap_buf_memcpy_ = 0; + int num_compressed_buf_memcpy_ = 0; + +#endif + static const uint32_t kDefaultStackBufferSize = 5000; + + RandomAccessFileReader* file_; + FilePrefetchBuffer* prefetch_buffer_; + const Footer& footer_; + const ReadOptions read_options_; + const BlockHandle& handle_; + BlockContents* contents_; + const ImmutableOptions& ioptions_; + const bool do_uncompress_; + const bool maybe_compressed_; + const BlockType block_type_; + const size_t block_size_; + const size_t block_size_with_trailer_; + const UncompressionDict& uncompression_dict_; + const PersistentCacheOptions& cache_options_; + MemoryAllocator* memory_allocator_; + MemoryAllocator* memory_allocator_compressed_; + IOStatus io_status_; + Slice slice_; + char* used_buf_ = nullptr; + AlignedBuf direct_io_buf_; + CacheAllocationPtr heap_buf_; + CacheAllocationPtr compressed_buf_; + char stack_buf_[kDefaultStackBufferSize]; + bool got_from_prefetch_buffer_ = false; + CompressionType compression_type_; + bool for_compaction_ = false; + + // return true if found + bool TryGetUncompressBlockFromPersistentCache(); + // return true if found + bool TryGetFromPrefetchBuffer(); + bool TryGetSerializedBlockFromPersistentCache(); + void PrepareBufferForBlockFromFile(); + // Copy content from used_buf_ to new heap_buf_. + void CopyBufferToHeapBuf(); + // Copy content from used_buf_ to new compressed_buf_. + void CopyBufferToCompressedBuf(); + void GetBlockContents(); + void InsertCompressedBlockToPersistentCacheIfNeeded(); + void InsertUncompressedBlockToPersistentCacheIfNeeded(); + void ProcessTrailerIfPresent(); +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_fetcher_test.cc b/src/rocksdb/table/block_fetcher_test.cc new file mode 100644 index 000000000..82caee282 --- /dev/null +++ b/src/rocksdb/table/block_fetcher_test.cc @@ -0,0 +1,521 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
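
As a quick orientation before the test harness below, here is a minimal usage
sketch of the BlockFetcher declared above. It is not code from this diff: the
ReadOneDataBlock helper name is invented, and the caller is assumed to already
have a RandomAccessFileReader, Footer, BlockHandle, ImmutableOptions and
PersistentCacheOptions set up, much as the test fixture does.

#include <cassert>

#include "table/block_fetcher.h"

IOStatus ReadOneDataBlock(RandomAccessFileReader* file, const Footer& footer,
                          const BlockHandle& handle,
                          const ImmutableOptions& ioptions,
                          const PersistentCacheOptions& cache_options,
                          BlockContents* contents) {
  ReadOptions read_options;  // verify_checksums is on by default
  BlockFetcher fetcher(file, /*prefetch_buffer=*/nullptr, footer, read_options,
                       handle, contents, ioptions,
                       /*do_uncompress=*/true, /*maybe_compressed=*/true,
                       BlockType::kData, UncompressionDict::GetEmptyDict(),
                       cache_options);
  IOStatus io_s = fetcher.ReadBlockContents();
  if (io_s.ok()) {
    // With do_uncompress=true, a successful read always yields plain data.
    assert(fetcher.get_compression_type() == kNoCompression);
  }
  return io_s;
}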
+ +#include "table/block_fetcher.h" + +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/binary_search_index_reader.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +struct MemcpyStats { + int num_stack_buf_memcpy; + int num_heap_buf_memcpy; + int num_compressed_buf_memcpy; +}; + +struct BufAllocationStats { + int num_heap_buf_allocations; + int num_compressed_buf_allocations; +}; + +struct TestStats { + MemcpyStats memcpy_stats; + BufAllocationStats buf_allocation_stats; +}; + +class BlockFetcherTest : public testing::Test { + public: + enum class Mode { + kBufferedRead = 0, + kBufferedMmap, + kDirectRead, + kNumModes, + }; + // use NumModes as array size to avoid "size of array '...' has non-integral + // type" errors. + const static int NumModes = static_cast<int>(Mode::kNumModes); + + protected: + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_fetcher_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + void AssertSameBlock(const std::string& block1, const std::string& block2) { + ASSERT_EQ(block1, block2); + } + + // Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive. + void CreateTable(const std::string& table_name, + const CompressionType& compression_type) { + std::unique_ptr<WritableFileWriter> writer; + NewFileWriter(table_name, &writer); + + // Create table builder. + ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + ColumnFamilyOptions cf_options(options_); + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr<TableBuilder> table_builder(table_factory_.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); + + // Build table. + for (int i = 0; i < 9; i++) { + std::string key = ToInternalKey(std::to_string(i)); + // Append "00000000" to string value to enhance compression ratio + std::string value = "00000000" + std::to_string(i); + table_builder->Add(key, value); + } + ASSERT_OK(table_builder->Finish()); + } + + void FetchIndexBlock(const std::string& table_name, + CountedMemoryAllocator* heap_buf_allocator, + CountedMemoryAllocator* compressed_buf_allocator, + MemcpyStats* memcpy_stats, BlockContents* index_block, + std::string* result) { + FileOptions fopt(options_); + std::unique_ptr<RandomAccessFileReader> file; + NewFileReader(table_name, fopt, &file); + + // Get handle of the index block. 
+ Footer footer; + ReadFooter(file.get(), &footer); + const BlockHandle& index_handle = footer.index_handle(); + + CompressionType compression_type; + FetchBlock(file.get(), index_handle, BlockType::kIndex, + false /* compressed */, false /* do_uncompress */, + heap_buf_allocator, compressed_buf_allocator, index_block, + memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, CompressionType::kNoCompression); + result->assign(index_block->data.ToString()); + } + + // Fetches the first data block in both direct IO and non-direct IO mode. + // + // compressed: whether the data blocks are compressed; + // do_uncompress: whether the data blocks should be uncompressed on fetching. + // compression_type: the expected compression type. + // + // Expects: + // Block contents are the same. + // Bufferr allocation and memory copy statistics are expected. + void TestFetchDataBlock( + const std::string& table_name_prefix, bool compressed, bool do_uncompress, + std::array<TestStats, NumModes> expected_stats_by_mode) { + for (CompressionType compression_type : GetSupportedCompressions()) { + bool do_compress = compression_type != kNoCompression; + if (compressed != do_compress) continue; + std::string compression_type_str = + CompressionTypeToString(compression_type); + + std::string table_name = table_name_prefix + compression_type_str; + CreateTable(table_name, compression_type); + + CompressionType expected_compression_type_after_fetch = + (compressed && !do_uncompress) ? compression_type : kNoCompression; + + BlockContents blocks[NumModes]; + std::string block_datas[NumModes]; + MemcpyStats memcpy_stats[NumModes]; + CountedMemoryAllocator heap_buf_allocators[NumModes]; + CountedMemoryAllocator compressed_buf_allocators[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast<Mode>(i)); + FetchFirstDataBlock(table_name, compressed, do_uncompress, + expected_compression_type_after_fetch, + &heap_buf_allocators[i], + &compressed_buf_allocators[i], &blocks[i], + &block_datas[i], &memcpy_stats[i]); + } + + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(block_datas[i], block_datas[i + 1]); + } + + // Check memcpy and buffer allocation statistics. + for (int i = 0; i < NumModes; ++i) { + const TestStats& expected_stats = expected_stats_by_mode[i]; + + ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy, + expected_stats.memcpy_stats.num_stack_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy, + expected_stats.memcpy_stats.num_heap_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy, + expected_stats.memcpy_stats.num_compressed_buf_memcpy); + + if (kXpressCompression == compression_type) { + // XPRESS allocates memory internally, thus does not support for + // custom allocator verification + continue; + } else { + ASSERT_EQ( + heap_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + + // The allocated buffers are not deallocated until + // the block content is deleted. 
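          // (Annotation: the reset of blocks[i].allocation a few lines down is
          // what releases the buffers, which is why the deallocation counters
          // are compared twice, once before and once after the reset.)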
+ ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); + blocks[i].allocation.reset(); + ASSERT_EQ( + heap_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + } + } + } + } + + void SetMode(Mode mode) { + switch (mode) { + case Mode::kBufferedRead: + options_.use_direct_reads = false; + options_.allow_mmap_reads = false; + break; + case Mode::kBufferedMmap: + options_.use_direct_reads = false; + options_.allow_mmap_reads = true; + break; + case Mode::kDirectRead: + options_.use_direct_reads = true; + options_.allow_mmap_reads = false; + break; + case Mode::kNumModes: + assert(false); + } + } + + private: + std::string test_dir_; + Env* env_; + std::shared_ptr<FileSystem> fs_; + BlockBasedTableFactory table_factory_; + Options options_; + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } + + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr<FSWritableFile> f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const std::string& filename, + std::unique_ptr<WritableFileWriter>* writer) { + std::string path = Path(filename); + FileOptions file_options; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path, + file_options, writer, nullptr)); + } + + void NewFileReader(const std::string& filename, const FileOptions& opt, + std::unique_ptr<RandomAccessFileReader>* reader) { + std::string path = Path(filename); + std::unique_ptr<FSRandomAccessFile> f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + void NewTableReader(const ImmutableOptions& ioptions, + const FileOptions& foptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr<BlockBasedTable>* table) { + std::unique_ptr<RandomAccessFileReader> file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr<TableReader> table_reader; + ReadOptions ro; + const auto* table_options = + table_factory_.GetOptions<BlockBasedTableOptions>(); + ASSERT_NE(table_options, nullptr); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + &table_reader)); + + table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } + + void ReadFooter(RandomAccessFileReader* file, Footer* footer) { + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */, + file_size, footer, + kBlockBasedTableMagicNumber)); + } + + // NOTE: compression_type returns the compression type of the fetched block + // contents, so if the block is fetched and uncompressed, then it's + // kNoCompression. 
+ void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block, + BlockType block_type, bool compressed, bool do_uncompress, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* contents, MemcpyStats* stats, + CompressionType* compresstion_type) { + ImmutableOptions ioptions(options_); + ReadOptions roptions; + PersistentCacheOptions persistent_cache_options; + Footer footer; + ReadFooter(file, &footer); + std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher( + file, nullptr /* prefetch_buffer */, footer, roptions, block, contents, + ioptions, do_uncompress, compressed, block_type, + UncompressionDict::GetEmptyDict(), persistent_cache_options, + heap_buf_allocator, compressed_buf_allocator)); + + ASSERT_OK(fetcher->ReadBlockContents()); + + stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy(); + stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy(); + stats->num_compressed_buf_memcpy = + fetcher->TEST_GetNumCompressedBufMemcpy(); + + *compresstion_type = fetcher->get_compression_type(); + } + + // NOTE: expected_compression_type is the expected compression + // type of the fetched block content, if the block is uncompressed, + // then the expected compression type is kNoCompression. + void FetchFirstDataBlock(const std::string& table_name, bool compressed, + bool do_uncompress, + CompressionType expected_compression_type, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* block, std::string* result, + MemcpyStats* memcpy_stats) { + ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + FileOptions foptions(options_); + + // Get block handle for the first data block. + std::unique_ptr<BlockBasedTable> table; + NewTableReader(ioptions, foptions, comparator, table_name, &table); + + std::unique_ptr<BlockBasedTable::IndexReader> index_reader; + ReadOptions ro; + ASSERT_OK(BinarySearchIndexReader::Create( + table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */, + false /* prefetch */, false /* pin */, nullptr /* lookup_context */, + &index_reader)); + + std::unique_ptr<InternalIteratorBase<IndexValue>> iter( + index_reader->NewIterator( + ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */, + nullptr /* get_context */, nullptr /* lookup_context */)); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + BlockHandle first_block_handle = iter->value().handle; + + // Fetch first data block. + std::unique_ptr<RandomAccessFileReader> file; + NewFileReader(table_name, foptions, &file); + CompressionType compression_type; + FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed, + do_uncompress, heap_buf_allocator, compressed_buf_allocator, + block, memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, expected_compression_type); + result->assign(block->data.ToString()); + } +}; + +// Skip the following tests in lite mode since direct I/O is unsupported. +#ifndef ROCKSDB_LITE + +// Fetch index block under both direct IO and non-direct IO. +// Expects: +// the index block contents are the same for both read modes. 
+TEST_F(BlockFetcherTest, FetchIndexBlock) { + for (CompressionType compression : GetSupportedCompressions()) { + std::string table_name = + "FetchIndexBlock" + CompressionTypeToString(compression); + CreateTable(table_name, compression); + + CountedMemoryAllocator allocator; + MemcpyStats memcpy_stats; + BlockContents indexes[NumModes]; + std::string index_datas[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast<Mode>(i)); + FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats, + &indexes[i], &index_datas[i]); + } + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(index_datas[i], index_datas[i + 1]); + } + } +} + +// Data blocks are not compressed, +// fetch data block under direct IO, mmap IO,and non-direct IO. +// Expects: +// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a heap buffer and memcpy from the +// direct IO buffer to the heap buffer. +TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array<TestStats, NumModes> expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchUncompressedDataBlock", false, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch data block under both direct IO and non-direct IO, +// but do not uncompress. +// Expects: +// 1. in non-direct IO mode, allocate a compressed buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a compressed buffer and memcpy from the +// direct IO buffer to the compressed buffer. +TEST_F(BlockFetcherTest, FetchCompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 1 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 1 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array<TestStats, NumModes> expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchCompressedDataBlock", true, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch and uncompress data block under both direct IO and non-direct IO. +// Expects: +// 1. in non-direct IO mode, since the block is small, so it's first memcpyed +// to the stack buffer, then a heap buffer is allocated and the block is +// uncompressed into the heap. +// 2. in direct IO mode mode, allocate a heap buffer, then directly uncompress +// and memcpy from the direct IO buffer to the heap buffer. 
+TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) { + TestStats expected_buffered_read_stats = { + { + 1 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_direct_read_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array<TestStats, NumModes> expected_stats_by_mode{{ + expected_buffered_read_stats, + expected_mmap_stats, + expected_direct_read_stats, + }}; + TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true, + expected_stats_by_mode); +} + +#endif // ROCKSDB_LITE + +} // namespace +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/cleanable_test.cc b/src/rocksdb/table/cleanable_test.cc new file mode 100644 index 000000000..b58eb7dc6 --- /dev/null +++ b/src/rocksdb/table/cleanable_test.cc @@ -0,0 +1,390 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/cleanable.h" + +#include <gtest/gtest.h> + +#include <functional> + +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/perf_context.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +class CleanableTest : public testing::Test {}; + +// Use this to keep track of the cleanups that were actually performed +void Multiplier(void* arg1, void* arg2) { + int* res = reinterpret_cast<int*>(arg1); + int* num = reinterpret_cast<int*>(arg2); + *res *= *num; +} + +// the first Cleanup is on stack and the rest on heap, so test with both cases +TEST_F(CleanableTest, Register) { + int n2 = 2, n3 = 3; + int res = 1; + { Cleanable c1; } + // ~Cleanable + ASSERT_EQ(1, res); + + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + } + // ~Cleanable + ASSERT_EQ(2, res); + + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + } + // ~Cleanable + ASSERT_EQ(6, res); + + // Test the Reset does cleanup + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.Reset(); + ASSERT_EQ(6, res); + } + // ~Cleanable + ASSERT_EQ(6, res); + + // Test Clenable is usable after Reset + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.Reset(); + ASSERT_EQ(2, res); + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + } + // ~Cleanable + ASSERT_EQ(6, res); +} + +// the first Cleanup is on stack and the rest on heap, +// so test all the combinations of them +TEST_F(CleanableTest, Delegation) { + int n2 = 2, n3 = 
3, n5 = 5, n7 = 7; + int res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(2, res); + + res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(1, res); + + res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(6, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(30, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(210, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.DelegateCleanupsTo(&c2); // res = 2 * 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(70, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); // res = 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(35, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); // res = 5; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(5, res); +} + +static void ReleaseStringHeap(void* s, void*) { + delete reinterpret_cast<const std::string*>(s); +} + +class PinnableSlice4Test : public PinnableSlice { + public: + void TestStringIsRegistered(std::string* s) { + ASSERT_TRUE(cleanup_.function == ReleaseStringHeap); + ASSERT_EQ(cleanup_.arg1, s); + ASSERT_EQ(cleanup_.arg2, nullptr); + ASSERT_EQ(cleanup_.next, nullptr); + } +}; + +// Putting the PinnableSlice tests here due to similarity to Cleanable tests +TEST_F(CleanableTest, PinnableSlice) { + int n2 = 2; + int res = 1; + const std::string const_str = "123"; + + { + res = 1; + PinnableSlice4Test value; + Slice slice(const_str); + value.PinSlice(slice, Multiplier, &res, &n2); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } + // ~Cleanable + ASSERT_EQ(2, res); + + { + res = 1; + PinnableSlice4Test value; + Slice slice(const_str); + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + value.PinSlice(slice, &c1); + } + // ~Cleanable + ASSERT_EQ(1, res); // cleanups must have be delegated to value + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } 
+ // ~Cleanable + ASSERT_EQ(2, res); + + { + PinnableSlice4Test value; + Slice slice(const_str); + value.PinSelf(slice); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } + + { + PinnableSlice4Test value; + std::string* self_str_ptr = value.GetSelf(); + self_str_ptr->assign(const_str); + value.PinSelf(); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } +} + +static void Decrement(void* intptr, void*) { --*static_cast<int*>(intptr); } + +// Allow unit testing moved-from data +template <class T> +void MarkInitializedForClangAnalyze(T& t) { + // No net effect, but confuse analyzer. (Published advice doesn't work.) + char* p = reinterpret_cast<char*>(&t); + std::swap(*p, *p); +} + +TEST_F(CleanableTest, SharedWrapCleanables) { + int val = 5; + Cleanable c1, c2; + c1.RegisterCleanup(&Decrement, &val, nullptr); + c1.RegisterCleanup(&Decrement, &val, nullptr); + ASSERT_TRUE(c1.HasCleanups()); + ASSERT_FALSE(c2.HasCleanups()); + + SharedCleanablePtr scp1; + ASSERT_EQ(scp1.get(), nullptr); + + // No-ops + scp1.RegisterCopyWith(&c2); + scp1.MoveAsCleanupTo(&c2); + + ASSERT_FALSE(c2.HasCleanups()); + c2.RegisterCleanup(&Decrement, &val, nullptr); + c2.RegisterCleanup(&Decrement, &val, nullptr); + c2.RegisterCleanup(&Decrement, &val, nullptr); + + scp1.Allocate(); + ASSERT_NE(scp1.get(), nullptr); + ASSERT_FALSE(scp1->HasCleanups()); + + // Copy ctor (alias scp2 = scp1) + SharedCleanablePtr scp2{scp1}; + ASSERT_EQ(scp1.get(), scp2.get()); + + c1.DelegateCleanupsTo(&*scp1); + ASSERT_TRUE(scp1->HasCleanups()); + ASSERT_TRUE(scp2->HasCleanups()); + ASSERT_FALSE(c1.HasCleanups()); + + SharedCleanablePtr scp3; + ASSERT_EQ(scp3.get(), nullptr); + + // Copy operator (alias scp3 = scp2 = scp1) + scp3 = scp2; + + // Make scp2 point elsewhere + scp2.Allocate(); + c2.DelegateCleanupsTo(&*scp2); + + ASSERT_EQ(val, 5); + // Move operator, invoke old c2 cleanups + scp2 = std::move(scp1); + ASSERT_EQ(val, 2); + MarkInitializedForClangAnalyze(scp1); + ASSERT_EQ(scp1.get(), nullptr); + + // Move ctor + { + SharedCleanablePtr scp4{std::move(scp3)}; + MarkInitializedForClangAnalyze(scp3); + ASSERT_EQ(scp3.get(), nullptr); + ASSERT_EQ(scp4.get(), scp2.get()); + + scp2.Reset(); + ASSERT_EQ(val, 2); + // invoke old c1 cleanups + } + ASSERT_EQ(val, 0); +} + +TEST_F(CleanableTest, CleanableWrapShared) { + int val = 5; + SharedCleanablePtr scp1, scp2; + scp1.Allocate(); + scp1->RegisterCleanup(&Decrement, &val, nullptr); + scp1->RegisterCleanup(&Decrement, &val, nullptr); + + scp2.Allocate(); + scp2->RegisterCleanup(&Decrement, &val, nullptr); + scp2->RegisterCleanup(&Decrement, &val, nullptr); + scp2->RegisterCleanup(&Decrement, &val, nullptr); + + { + Cleanable c1; + { + Cleanable c2, c3; + scp1.RegisterCopyWith(&c1); + scp1.MoveAsCleanupTo(&c2); + ASSERT_TRUE(c1.HasCleanups()); + ASSERT_TRUE(c2.HasCleanups()); + ASSERT_EQ(scp1.get(), nullptr); + scp2.MoveAsCleanupTo(&c3); + ASSERT_TRUE(c3.HasCleanups()); + ASSERT_EQ(scp2.get(), nullptr); + c2.Reset(); + ASSERT_FALSE(c2.HasCleanups()); + ASSERT_EQ(val, 5); + // invoke cleanups from scp2 + } + ASSERT_EQ(val, 2); + // invoke cleanups from scp1 + } + ASSERT_EQ(val, 0); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc new file 
mode 100644 index 000000000..296825d94 --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc @@ -0,0 +1,553 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/cuckoo/cuckoo_table_builder.h" + +#include <assert.h> + +#include <algorithm> +#include <limits> +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "table/block_based/block_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/autovector.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +const std::string CuckooTablePropertyNames::kEmptyKey = + "rocksdb.cuckoo.bucket.empty.key"; +const std::string CuckooTablePropertyNames::kNumHashFunc = + "rocksdb.cuckoo.hash.num"; +const std::string CuckooTablePropertyNames::kHashTableSize = + "rocksdb.cuckoo.hash.size"; +const std::string CuckooTablePropertyNames::kValueLength = + "rocksdb.cuckoo.value.length"; +const std::string CuckooTablePropertyNames::kIsLastLevel = + "rocksdb.cuckoo.file.islastlevel"; +const std::string CuckooTablePropertyNames::kCuckooBlockSize = + "rocksdb.cuckoo.hash.cuckooblocksize"; +const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = + "rocksdb.cuckoo.hash.identityfirst"; +const std::string CuckooTablePropertyNames::kUseModuleHash = + "rocksdb.cuckoo.hash.usemodule"; +const std::string CuckooTablePropertyNames::kUserKeyLength = + "rocksdb.cuckoo.hash.userkeylength"; + +// Obtained by running echo rocksdb.table.cuckoo | sha1sum +extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; + +CuckooTableBuilder::CuckooTableBuilder( + WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_table, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name, + const std::string& db_id, const std::string& db_session_id, + uint64_t file_number) + : num_hash_func_(2), + file_(file), + max_hash_table_ratio_(max_hash_table_ratio), + max_num_hash_func_(max_num_hash_table), + max_search_depth_(max_search_depth), + cuckoo_block_size_(std::max(1U, cuckoo_block_size)), + hash_table_size_(use_module_hash ? 0 : 2), + is_last_level_file_(false), + has_seen_first_key_(false), + has_seen_first_value_(false), + key_size_(0), + value_size_(0), + num_entries_(0), + num_values_(0), + ucomp_(user_comparator), + use_module_hash_(use_module_hash), + identity_as_first_hash_(identity_as_first_hash), + get_slice_hash_(get_slice_hash), + closed_(false) { + // Data is in a huge block. 
+ properties_.num_data_blocks = 1; + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.orig_file_number = file_number; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); +} + +void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { + if (num_entries_ >= kMaxVectorIdx - 1) { + status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); + return; + } + ParsedInternalKey ikey; + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. ", + pik_status.getState()); + return; + } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + std::to_string(ikey.type)); + return; + } + + // Determine if we can ignore the sequence number and value type from + // internal keys by looking at sequence number from first key. We assume + // that if first key has a zero sequence number, then all the remaining + // keys will have zero seq. no. + if (!has_seen_first_key_) { + is_last_level_file_ = ikey.sequence == 0; + has_seen_first_key_ = true; + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); + } + if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; + } + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; + } else { + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } + } + ++num_entries_; + + // In order to fill the empty buckets in the hash table, we identify a + // key which is not used so far (unused_user_key). We determine this by + // maintaining smallest and largest keys inserted so far in bytewise order + // and use them to find a key outside this range in Finish() operation. + // Note that this strategy is independent of user comparator used here. 
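+ // For illustration (hypothetical keys): if the smallest inserted user key
+ // is "key01", Finish() decrements its last byte to obtain "key00", which is
+ // bytewise-smaller than every inserted key and can safely mark empty
+ // buckets. Only if every byte of the smallest key is 0x00 does it fall back
+ // to incrementing bytes of the largest key instead.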
+ if (ikey.user_key.compare(smallest_user_key_) < 0) { + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } else if (ikey.user_key.compare(largest_user_key_) > 0) { + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } + if (!use_module_hash_) { + if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) { + hash_table_size_ *= 2; + } + } +} + +bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { + assert(closed_); + return idx >= num_values_; +} + +Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + return Slice( + &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)], + static_cast<size_t>(key_size_)); + } + return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))], + static_cast<size_t>(key_size_)); +} + +Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + assert(closed_); + return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx)); +} + +Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + static std::string empty_value(static_cast<unsigned int>(value_size_), 'a'); + return Slice(empty_value); + } + return Slice( + &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)], + static_cast<size_t>(value_size_)); +} + +Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) { + buckets->resize( + static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1)); + uint32_t make_space_for_key_call_id = 0; + for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { + uint64_t bucket_id = 0; + bool bucket_found = false; + autovector<uint64_t> hash_vals; + Slice user_key = GetUserKey(vector_idx); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; + ++hash_cnt) { + uint64_t hash_val = + CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_, + identity_as_first_hash_, get_slice_hash_); + // If there is a collision, check next cuckoo_block_size_ locations for + // empty locations. While checking, if we reach end of the hash table, + // stop searching and proceed for next hash function. + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == + kMaxVectorIdx) { + bucket_id = hash_val; + bucket_found = true; + break; + } else { + if (ucomp_->Compare( + user_key, GetUserKey((*buckets)[static_cast<size_t>(hash_val)] + .vector_idx)) == 0) { + return Status::NotSupported("Same key is being inserted again."); + } + hash_vals.push_back(hash_val); + } + } + } + while (!bucket_found && + !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets, + &bucket_id)) { + // Rehash by increashing number of hash tables. + if (num_hash_func_ >= max_num_hash_func_) { + return Status::NotSupported("Too many collisions. Unable to hash."); + } + // We don't really need to rehash the entire table because old hashes are + // still valid and we only increased the number of hash functions. 
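+ // For example, growing from 2 to 3 hash functions leaves every bucket
+ // chosen by functions 0 and 1 where it is; only the candidate bucket of
+ // the newly added function needs to be probed for the current key.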
+ uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_, + hash_table_size_, identity_as_first_hash_, + get_slice_hash_); + ++num_hash_func_; + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == + kMaxVectorIdx) { + bucket_found = true; + bucket_id = hash_val; + break; + } else { + hash_vals.push_back(hash_val); + } + } + } + (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx; + } + return Status::OK(); +} + +Status CuckooTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + std::vector<CuckooBucket> buckets; + std::string unused_bucket; + if (num_entries_ > 0) { + // Calculate the real hash size if module hash is enabled. + if (use_module_hash_) { + hash_table_size_ = + static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_); + } + status_ = MakeHashTable(&buckets); + if (!status_.ok()) { + return status_; + } + // Determine unused_user_key to fill empty buckets. + std::string unused_user_key = smallest_user_key_; + int curr_pos = static_cast<int>(unused_user_key.size()) - 1; + while (curr_pos >= 0) { + --unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(smallest_user_key_) < 0) { + break; + } + --curr_pos; + } + if (curr_pos < 0) { + // Try using the largest key to identify an unused key. + unused_user_key = largest_user_key_; + curr_pos = static_cast<int>(unused_user_key.size()) - 1; + while (curr_pos >= 0) { + ++unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(largest_user_key_) > 0) { + break; + } + --curr_pos; + } + } + if (curr_pos < 0) { + return Status::Corruption("Unable to find unused key"); + } + if (is_last_level_file_) { + unused_bucket = unused_user_key; + } else { + ParsedInternalKey ikey(unused_user_key, 0, kTypeValue); + AppendInternalKey(&unused_bucket, ikey); + } + } + properties_.num_entries = num_entries_; + properties_.num_deletions = num_entries_ - num_values_; + properties_.fixed_key_len = key_size_; + properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength] + .assign(reinterpret_cast<const char*>(&value_size_), sizeof(value_size_)); + + uint64_t bucket_size = key_size_ + value_size_; + unused_bucket.resize(static_cast<size_t>(bucket_size), 'a'); + // Write the table. 
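+ // Resulting file layout, in write order:
+ //   [bucket 0] ... [bucket N-1]   fixed-size buckets of key_size_ +
+ //                                 value_size_ bytes, where
+ //                                 N = hash_table_size_ + cuckoo_block_size_
+ //                                 - 1; empty slots hold unused_bucket
+ //   [properties block]
+ //   [metaindex block]
+ //   [footer]                      format_version 1 with
+ //                                 kCuckooTableMagicNumber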
+ uint32_t num_added = 0; + for (auto& bucket : buckets) { + if (bucket.vector_idx == kMaxVectorIdx) { + io_status_ = file_->Append(Slice(unused_bucket)); + } else { + ++num_added; + io_status_ = file_->Append(GetKey(bucket.vector_idx)); + if (io_status_.ok()) { + if (value_size_ > 0) { + io_status_ = file_->Append(GetValue(bucket.vector_idx)); + } + } + } + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + } + assert(num_added == NumEntries()); + properties_.raw_key_size = num_added * properties_.fixed_key_len; + properties_.raw_value_size = num_added * value_size_; + + uint64_t offset = buckets.size() * bucket_size; + properties_.data_size = offset; + unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len)); + properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] = + unused_bucket; + properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc] + .assign(reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_)); + + properties_ + .user_collected_properties[CuckooTablePropertyNames::kHashTableSize] + .assign(reinterpret_cast<const char*>(&hash_table_size_), + sizeof(hash_table_size_)); + properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel] + .assign(reinterpret_cast<const char*>(&is_last_level_file_), + sizeof(is_last_level_file_)); + properties_ + .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize] + .assign(reinterpret_cast<const char*>(&cuckoo_block_size_), + sizeof(cuckoo_block_size_)); + properties_ + .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash] + .assign(reinterpret_cast<const char*>(&identity_as_first_hash_), + sizeof(identity_as_first_hash_)); + properties_ + .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash] + .assign(reinterpret_cast<const char*>(&use_module_hash_), + sizeof(use_module_hash_)); + uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size()); + properties_ + .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength] + .assign(reinterpret_cast<const char*>(&user_key_len), + sizeof(user_key_len)); + + // Write meta blocks. 
+ MetaIndexBuilder meta_index_builder; + PropertyBlockBuilder property_block_builder; + + property_block_builder.AddTableProperty(properties_); + property_block_builder.Add(properties_.user_collected_properties); + Slice property_block = property_block_builder.Finish(); + BlockHandle property_block_handle; + property_block_handle.set_offset(offset); + property_block_handle.set_size(property_block.size()); + io_status_ = file_->Append(property_block); + offset += property_block.size(); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + meta_index_builder.Add(kPropertiesBlockName, property_block_handle); + Slice meta_index_block = meta_index_builder.Finish(); + + BlockHandle meta_index_block_handle; + meta_index_block_handle.set_offset(offset); + meta_index_block_handle.set_size(meta_index_block.size()); + io_status_ = file_->Append(meta_index_block); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + FooterBuilder footer; + footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset, + kNoChecksum, meta_index_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + status_ = io_status_; + return status_; +} + +void CuckooTableBuilder::Abandon() { + assert(!closed_); + closed_ = true; +} + +uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; } + +uint64_t CuckooTableBuilder::FileSize() const { + if (closed_) { + return file_->GetFileSize(); + } else if (num_entries_ == 0) { + return 0; + } + + if (use_module_hash_) { + return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ / + max_hash_table_ratio_); + } else { + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and + // doubles its size. Since compaction algorithm stops adding elements + // only after it exceeds the file limit, we account for the extra element + // being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; + } + return (key_size_ + value_size_) * expected_hash_table_size - 1; + } +} + +// This method is invoked when there is no place to insert the target key. +// It searches for a set of elements that can be moved to accommodate target +// key. The search is a BFS graph traversal with first level (hash_vals) +// being all the buckets target key could go to. +// Then, from each node (curr_node), we find all the buckets that curr_node +// could go to. They form the children of curr_node in the tree. +// We continue the traversal until we find an empty bucket, in which case, we +// move all elements along the path from first level to this empty bucket, to +// make space for target key which is inserted at first level (*bucket_id). +// If tree depth exceedes max depth, we return false indicating failure. +bool CuckooTableBuilder::MakeSpaceForKey( + const autovector<uint64_t>& hash_vals, + const uint32_t make_space_for_key_call_id, + std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) { + struct CuckooNode { + uint64_t bucket_id; + uint32_t depth; + uint32_t parent_pos; + CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos) + : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {} + }; + // This is BFS search tree that is stored simply as a vector. + // Each node stores the index of parent node in the vector. 
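+ // Worked example (hypothetical buckets): suppose the new key X hashes to
+ // occupied buckets {b0, b1}, and the occupant of b1 also hashes to an empty
+ // bucket b2. The first BFS level is {b0, b1}; expanding b1 reaches b2 and
+ // ends the search. The path replay at the end of this method then moves
+ // b1's occupant into b2 and reports b1 as the slot for X.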
+ std::vector<CuckooNode> tree; + // We want to identify already visited buckets in the current method call so + // that we don't add same buckets again for exploration in the tree. + // We do this by maintaining a count of current method call in + // make_space_for_key_call_id, which acts as a unique id for this invocation + // of the method. We store this number into the nodes that we explore in + // current method call. + // It is unlikely for the increment operation to overflow because the maximum + // no. of times this will be called is <= max_num_hash_func_ + num_entries_. + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { + uint64_t bid = hash_vals[hash_cnt]; + (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = + make_space_for_key_call_id; + tree.push_back(CuckooNode(bid, 0, 0)); + } + bool null_found = false; + uint32_t curr_pos = 0; + while (!null_found && curr_pos < tree.size()) { + CuckooNode& curr_node = tree[curr_pos]; + uint32_t curr_depth = curr_node.depth; + if (curr_depth >= max_search_depth_) { + break; + } + CuckooBucket& curr_bucket = + (*buckets)[static_cast<size_t>(curr_node.bucket_id)]; + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; + ++hash_cnt) { + uint64_t child_bucket_id = CuckooHash( + GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); + // Iterate inside Cuckoo Block. + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++child_bucket_id) { + if ((*buckets)[static_cast<size_t>(child_bucket_id)] + .make_space_for_key_call_id == make_space_for_key_call_id) { + continue; + } + (*buckets)[static_cast<size_t>(child_bucket_id)] + .make_space_for_key_call_id = make_space_for_key_call_id; + tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos)); + if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == + kMaxVectorIdx) { + null_found = true; + break; + } + } + } + ++curr_pos; + } + + if (null_found) { + // There is an empty node in tree.back(). Now, traverse the path from this + // empty node to top of the tree and at every node in the path, replace + // child with the parent. Stop when first level is reached in the tree + // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return + // this location in first level for target key to be inserted. 
+ uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1; + while (bucket_to_replace_pos >= num_hash_func_) { + CuckooNode& curr_node = tree[bucket_to_replace_pos]; + (*buckets)[static_cast<size_t>(curr_node.bucket_id)] = + (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)]; + bucket_to_replace_pos = curr_node.parent_pos; + } + *bucket_id = tree[bucket_to_replace_pos].bucket_id; + } + return null_found; +} + +std::string CuckooTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* CuckooTableBuilder::GetFileChecksumFuncName() const { + if (file_ != nullptr) { + return file_->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName; + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h new file mode 100644 index 000000000..a125e1f4c --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include <stdint.h> + +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/version_edit.h" +#include "port/port.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_builder.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class CuckooTableBuilder : public TableBuilder { + public: + CuckooTableBuilder( + WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name, + const std::string& db_id = "", const std::string& db_session_id = "", + uint64_t file_number = 0); + // No copying allowed + CuckooTableBuilder(const CuckooTableBuilder&) = delete; + void operator=(const CuckooTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~CuckooTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override { return status_; } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. 
+ // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + // Get file checksum + std::string GetFileChecksum() const override; + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + private: + struct CuckooBucket { + CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {} + uint32_t vector_idx; + // This number will not exceed kvs_.size() + max_num_hash_func_. + // We assume number of items is <= 2^32. + uint32_t make_space_for_key_call_id; + }; + static const uint32_t kMaxVectorIdx = std::numeric_limits<int32_t>::max(); + + bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals, + const uint32_t call_id, + std::vector<CuckooBucket>* buckets, uint64_t* bucket_id); + Status MakeHashTable(std::vector<CuckooBucket>* buckets); + + inline bool IsDeletedKey(uint64_t idx) const; + inline Slice GetKey(uint64_t idx) const; + inline Slice GetUserKey(uint64_t idx) const; + inline Slice GetValue(uint64_t idx) const; + + uint32_t num_hash_func_; + WritableFileWriter* file_; + const double max_hash_table_ratio_; + const uint32_t max_num_hash_func_; + const uint32_t max_search_depth_; + const uint32_t cuckoo_block_size_; + uint64_t hash_table_size_; + bool is_last_level_file_; + bool has_seen_first_key_; + bool has_seen_first_value_; + uint64_t key_size_; + uint64_t value_size_; + // A list of fixed-size key-value pairs concatenating into a string. + // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific + // key / value given an index + std::string kvs_; + std::string deleted_keys_; + // Number of key-value pairs stored in kvs_ + number of deleted keys + uint64_t num_entries_; + // Number of keys that contain value (non-deletion op) + uint64_t num_values_; + Status status_; + IOStatus io_status_; + TableProperties properties_; + const Comparator* ucomp_; + bool use_module_hash_; + bool identity_as_first_hash_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, + uint64_t max_num_buckets); + std::string largest_user_key_ = ""; + std::string smallest_user_key_ = ""; + + bool closed_; // Either Finish() or Abandon() has been called. +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc new file mode 100644 index 000000000..be1c62117 --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc @@ -0,0 +1,640 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "table/cuckoo/cuckoo_table_builder.h" + +#include <map> +#include <string> +#include <utility> +#include <vector> + +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +extern const uint64_t kCuckooTableMagicNumber; + +namespace { +std::unordered_map<std::string, std::vector<uint64_t>> hash_map; + +uint64_t GetSliceHash(const Slice& s, uint32_t index, + uint64_t /*max_num_buckets*/) { + return hash_map[s.ToString()][index]; +} +} // namespace + +class CuckooBuilderTest : public testing::Test { + public: + CuckooBuilderTest() { + env_ = Env::Default(); + Options options; + options.allow_mmap_reads = true; + file_options_ = FileOptions(options); + } + + void CheckFileContents(const std::vector<std::string>& keys, + const std::vector<std::string>& values, + const std::vector<uint64_t>& expected_locations, + std::string expected_unused_bucket, + uint64_t expected_table_size, + uint32_t expected_num_hash_func, + bool expected_is_last_level, + uint32_t expected_cuckoo_block_size = 1) { + uint64_t num_deletions = 0; + for (const auto& key : keys) { + ParsedInternalKey parsed; + Status pik_status = + ParseInternalKey(key, &parsed, true /* log_err_key */); + if (pik_status.ok() && parsed.type == kTypeDeletion) { + num_deletions++; + } + } + // Read file + uint64_t read_file_size; + ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + std::unique_ptr<RandomAccessFileReader> file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr)); + + Options options; + options.allow_mmap_reads = true; + ImmutableOptions ioptions(options); + + // Assert Table Properties. + std::unique_ptr<TableProperties> props; + ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, + kCuckooTableMagicNumber, ioptions, &props)); + // Check unused bucket. + std::string unused_key = + props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey]; + ASSERT_EQ(expected_unused_bucket.substr(0, props->fixed_key_len), + unused_key); + + uint64_t value_len_found = *reinterpret_cast<const uint64_t*>( + props->user_collected_properties[CuckooTablePropertyNames::kValueLength] + .data()); + ASSERT_EQ(values.empty() ? 
0 : values[0].size(), value_len_found); + ASSERT_EQ(props->raw_value_size, values.size() * value_len_found); + const uint64_t table_size = *reinterpret_cast<const uint64_t*>( + props + ->user_collected_properties + [CuckooTablePropertyNames::kHashTableSize] + .data()); + ASSERT_EQ(expected_table_size, table_size); + const uint32_t num_hash_func_found = *reinterpret_cast<const uint32_t*>( + props->user_collected_properties[CuckooTablePropertyNames::kNumHashFunc] + .data()); + ASSERT_EQ(expected_num_hash_func, num_hash_func_found); + const uint32_t cuckoo_block_size = *reinterpret_cast<const uint32_t*>( + props + ->user_collected_properties + [CuckooTablePropertyNames::kCuckooBlockSize] + .data()); + ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size); + const bool is_last_level_found = *reinterpret_cast<const bool*>( + props->user_collected_properties[CuckooTablePropertyNames::kIsLastLevel] + .data()); + ASSERT_EQ(expected_is_last_level, is_last_level_found); + + ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->num_deletions, num_deletions); + ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); + ASSERT_EQ(props->data_size, + expected_unused_bucket.size() * + (expected_table_size + expected_cuckoo_block_size - 1)); + ASSERT_EQ(props->raw_key_size, keys.size() * props->fixed_key_len); + ASSERT_EQ(props->column_family_id, 0); + ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName); + + // Check contents of the bucket. + std::vector<bool> keys_found(keys.size(), false); + size_t bucket_size = expected_unused_bucket.size(); + for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) { + Slice read_slice; + ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size, + &read_slice, nullptr, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */)); + size_t key_idx = + std::find(expected_locations.begin(), expected_locations.end(), i) - + expected_locations.begin(); + if (key_idx == keys.size()) { + // i is not one of the expected locations. Empty bucket. + if (read_slice.data() == nullptr) { + ASSERT_EQ(0, expected_unused_bucket.size()); + } else { + ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); + } + } else { + keys_found[key_idx] = true; + ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0); + } + } + for (auto key_found : keys_found) { + // Check that all keys wereReader found. + ASSERT_TRUE(key_found); + } + } + + std::string GetInternalKey(Slice user_key, bool zero_seqno, + ValueType type = kTypeValue) { + IterKey ikey; + ikey.SetInternalKey(user_key, zero_seqno ? 
0 : 1000, type); + return ikey.GetInternalKey().ToString(); + } + + uint64_t NextPowOf2(uint64_t num) { + uint64_t n = 2; + while (n <= num) { + n *= 2; + } + return n; + } + + uint64_t GetExpectedTableSize(uint64_t num) { + return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio)); + } + + Env* env_; + FileOptions file_options_; + std::string fname; + const double kHashTableRatio = 0.9; +}; + +TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("EmptyFile"); + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, + BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + CheckFileContents({}, {}, {}, "", 2, 2, false); +} + +TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { + for (auto type : {kTypeValue, kTypeDeletion}) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values; + if (type == kTypeValue) { + values = {"v01", "v02", "v03", "v04"}; + } else { + values = {"", "", "", ""}; + } + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false, type)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + fname = test::PerThreadDBPath("NoCollisionFullKey"); + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false); + } +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with 
initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + fname = test::PerThreadDBPath("WithCollisionFullKey"); + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 4, false); +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + uint32_t cuckoo_block_size = 2; + fname = test::PerThreadDBPath("WithCollisionFullKey2"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder( + file_writer.get(), kHashTableRatio, num_hash_fun, 100, + BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, + 0 /* column_family_id */, kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += 
std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 3, false, cuckoo_block_size); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { + // Have two hash functions. Insert elements with overlapping hashes. + // Finally insert an element with hash value somewhere in the middle + // so that it displaces all the elements after that. + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04", + "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("WithCollisionPathFullKey"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04", + "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {3, 4}}, + {user_keys[3], {4, 5}}, {user_keys[4], {0, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), 
kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 2, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false, 2); +} + +TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("NoCollisionUserKey"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = "key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, true); +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = 
test::PerThreadDBPath("WithCollisionUserKey"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = "key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 4, true); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04", + "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = "key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, true); +} + +TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { + // Have two hash functions. Insert elements with overlapping hashes. + // Finally try inserting an element with hash value somewhere in the middle + // and it should fail because the no. of elements to displace is too high. 
+ uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04", + "key05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, {user_keys[4], {0, 1}}, + }; + hash_map = std::move(hm); + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(file_writer->Close()); +} + +TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {"repeatedkey", {0, 1, 2, 3}}}; + hash_map = std::move(hm); + uint32_t num_hash_fun = 4; + std::string user_key = "repeatedkey"; + + std::unique_ptr<WritableFileWriter> file_writer; + fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + + builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); + ASSERT_EQ(builder.NumEntries(), 1u); + ASSERT_OK(builder.status()); + builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2")); + ASSERT_EQ(builder.NumEntries(), 2u); + ASSERT_OK(builder.status()); + + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(file_writer->Close()); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc new file mode 100644 index 000000000..1253c92dd --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
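// A minimal usage sketch of the factory implemented in this file, assuming
// only the public headers rocksdb/db.h, rocksdb/options.h and rocksdb/table.h.
// The wrapper function name and the option values are illustrative, not part
// of this diff:

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Status OpenWithCuckooTables(const std::string& path,
                                     rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // CuckooTableReader only works with mmap reads (see cuckoo_table_reader.cc).
  options.allow_mmap_reads = true;

  rocksdb::CuckooTableOptions cuckoo;
  cuckoo.hash_table_ratio = 0.9;  // target load factor of the hash table
  cuckoo.max_search_depth = 100;  // displacement chain limit before giving up
  options.table_factory.reset(rocksdb::NewCuckooTableFactory(cuckoo));

  return rocksdb::DB::Open(options, path, db);
}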
+ +#ifndef ROCKSDB_LITE +#include "table/cuckoo/cuckoo_table_factory.h" + +#include "db/dbformat.h" +#include "options/configurable_helper.h" +#include "rocksdb/utilities/options_type.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +Status CuckooTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader( + table_reader_options.ioptions, std::move(file), file_size, + table_reader_options.internal_comparator.user_comparator(), nullptr)); + Status s = new_reader->status(); + if (s.ok()) { + *table = std::move(new_reader); + } + return s; +} + +TableBuilder* CuckooTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + // TODO: change builder to take the option struct + return new CuckooTableBuilder( + file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, + table_builder_options.internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.use_module_hash, + table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); +} + +std::string CuckooTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(2000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " max_search_depth: %u\n", + table_options_.max_search_depth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", + table_options_.cuckoo_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); + ret.append(buffer); + return ret; +} + +static std::unordered_map<std::string, OptionTypeInfo> cuckoo_table_type_info = + { +#ifndef ROCKSDB_LITE + {"hash_table_ratio", + {offsetof(struct CuckooTableOptions, hash_table_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_search_depth", + {offsetof(struct CuckooTableOptions, max_search_depth), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cuckoo_block_size", + {offsetof(struct CuckooTableOptions, cuckoo_block_size), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"identity_as_first_hash", + {offsetof(struct CuckooTableOptions, identity_as_first_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_module_hash", + {offsetof(struct CuckooTableOptions, use_module_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) { + RegisterOptions(&table_options_, &cuckoo_table_type_info); +} + +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new 
CuckooTableFactory(table_options); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h new file mode 100644 index 000000000..9937c28dd --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> + +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "util/murmurhash.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t kCuckooMurmurSeedMultiplier = 816922183; +static inline uint64_t CuckooHash( + const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, + uint64_t table_size_, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { +#if !defined NDEBUG || defined OS_WIN + // This part is used only in unit tests but we have to keep it for Windows + // build as we run test in both debug and release modes under Windows. + if (get_slice_hash != nullptr) { + return get_slice_hash(user_key, hash_cnt, table_size_); + } +#else + (void)get_slice_hash; +#endif + + uint64_t value = 0; + if (hash_cnt == 0 && identity_as_first_hash) { + value = (*reinterpret_cast<const int64_t*>(user_key.data())); + } else { + value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()), + kCuckooMurmurSeedMultiplier * hash_cnt); + } + if (use_module_hash) { + return value % table_size_; + } else { + return value & (table_size_ - 1); + } +} + +// Cuckoo Table is designed for applications that require fast point lookups +// but not fast range scans. +// +// Some assumptions: +// - Key length and Value length are fixed. +// - Does not support Snapshot. +// - Does not support Merge operations. +// - Does not support prefix bloom filters. +class CuckooTableFactory : public TableFactory { + public: + explicit CuckooTableFactory( + const CuckooTableOptions& table_option = CuckooTableOptions()); + ~CuckooTableFactory() {} + + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } + const char* Name() const override { return kCuckooTableName(); } + + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + std::string GetPrintableOptions() const override; + + private: + CuckooTableOptions table_options_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc new file mode 100644 index 000000000..1d70909a6 --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc @@ -0,0 +1,411 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "table/cuckoo/cuckoo_table_reader.h" + +#include <algorithm> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); +} // namespace + +extern const uint64_t kCuckooTableMagicNumber; + +CuckooTableReader::CuckooTableReader( + const ImmutableOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + const Comparator* comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) + : file_(std::move(file)), + is_last_level_(false), + identity_as_first_hash_(false), + use_module_hash_(false), + num_hash_func_(0), + unused_key_(""), + key_length_(0), + user_key_length_(0), + value_length_(0), + bucket_length_(0), + cuckoo_block_size_(0), + cuckoo_block_bytes_minus_one_(0), + table_size_(0), + ucomp_(comparator), + get_slice_hash_(get_slice_hash) { + if (!ioptions.allow_mmap_reads) { + status_ = Status::InvalidArgument("File is not mmaped"); + return; + } + { + std::unique_ptr<TableProperties> props; + status_ = ReadTableProperties(file_.get(), file_size, + kCuckooTableMagicNumber, ioptions, &props); + if (!status_.ok()) { + return; + } + table_props_ = std::move(props); + } + auto& user_props = table_props_->user_collected_properties; + auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); + if (hash_funs == user_props.end()) { + status_ = Status::Corruption("Number of hash functions not found"); + return; + } + num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); + auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); + if (unused_key == user_props.end()) { + status_ = Status::Corruption("Empty bucket value not found"); + return; + } + unused_key_ = unused_key->second; + + key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len); + auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); + if (user_key_len == user_props.end()) { + status_ = Status::Corruption("User key length not found"); + return; + } + user_key_length_ = + *reinterpret_cast<const uint32_t*>(user_key_len->second.data()); + + auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); + if (value_length == user_props.end()) { + status_ = Status::Corruption("Value length not found"); + return; + } + value_length_ = + *reinterpret_cast<const uint32_t*>(value_length->second.data()); + bucket_length_ = key_length_ + value_length_; + + auto hash_table_size = + user_props.find(CuckooTablePropertyNames::kHashTableSize); + if (hash_table_size == user_props.end()) { + status_ = Status::Corruption("Hash table size not found"); + return; + } + 
table_size_ = + *reinterpret_cast<const uint64_t*>(hash_table_size->second.data()); + + auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); + if (is_last_level == user_props.end()) { + status_ = Status::Corruption("Is last level not found"); + return; + } + is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); + + auto identity_as_first_hash = + user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash); + if (identity_as_first_hash == user_props.end()) { + status_ = Status::Corruption("identity as first hash not found"); + return; + } + identity_as_first_hash_ = + *reinterpret_cast<const bool*>(identity_as_first_hash->second.data()); + + auto use_module_hash = + user_props.find(CuckooTablePropertyNames::kUseModuleHash); + if (use_module_hash == user_props.end()) { + status_ = Status::Corruption("hash type is not found"); + return; + } + use_module_hash_ = + *reinterpret_cast<const bool*>(use_module_hash->second.data()); + auto cuckoo_block_size = + user_props.find(CuckooTablePropertyNames::kCuckooBlockSize); + if (cuckoo_block_size == user_props.end()) { + status_ = Status::Corruption("Cuckoo block size not found"); + return; + } + cuckoo_block_size_ = + *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data()); + cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; + // TODO: rate limit reads of whole cuckoo tables. + status_ = + file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_, + nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); +} + +Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, + const Slice& key, GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { + assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); + Slice user_key = ExtractUserKey(key); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { + uint64_t offset = + bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_, + table_size_, identity_as_first_hash_, + get_slice_hash_); + const char* bucket = &file_data_.data()[offset]; + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, bucket += bucket_length_) { + if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), + Slice(bucket, user_key.size()))) { + return Status::OK(); + } + // Here, we compare only the user key part as we support only one entry + // per user key and we don't support snapshot. + if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { + Slice value(bucket + key_length_, value_length_); + if (is_last_level_) { + // Sequence number is not stored at the last level, so we will use + // kMaxSequenceNumber since it is unknown. This could cause some + // transactions to fail to lock a key due to known sequence number. + // However, it is expected for anyone to use a CuckooTable in a + // TransactionDB. + get_context->SaveValue(value, kMaxSequenceNumber); + } else { + Slice full_key(bucket, key_length_); + ParsedInternalKey found_ikey; + Status s = ParseInternalKey(full_key, &found_ikey, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + bool dont_care __attribute__((__unused__)); + get_context->SaveValue(found_ikey, value, &dont_care); + } + // We don't support merge operations. So, we return here. + return Status::OK(); + } + } + } + return Status::OK(); +} + +void CuckooTableReader::Prepare(const Slice& key) { + // Prefetch the first Cuckoo Block. 
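  // The block spans cuckoo_block_size_ consecutive buckets starting at the
  // first candidate bucket (hash_cnt == 0), so one PREFETCH is issued per
  // cache line from the cache-line-aligned start through the end of the block.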
+ Slice user_key = ExtractUserKey(key); + uint64_t addr = + reinterpret_cast<uint64_t>(file_data_.data()) + + bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, + identity_as_first_hash_, nullptr); + uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; + for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { + PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); + } +} + +class CuckooTableIterator : public InternalIterator { + public: + explicit CuckooTableIterator(CuckooTableReader* reader); + // No copying allowed + CuckooTableIterator(const CuckooTableIterator&) = delete; + void operator=(const Iterator&) = delete; + ~CuckooTableIterator() override {} + bool Valid() const override; + void SeekToFirst() override; + void SeekToLast() override; + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void Next() override; + void Prev() override; + Slice key() const override; + Slice value() const override; + Status status() const override { return Status::OK(); } + void InitIfNeeded(); + + private: + struct BucketComparator { + BucketComparator(const Slice& file_data, const Comparator* ucomp, + uint32_t bucket_len, uint32_t user_key_len, + const Slice& target = Slice()) + : file_data_(file_data), + ucomp_(ucomp), + bucket_len_(bucket_len), + user_key_len_(user_key_len), + target_(target) {} + bool operator()(const uint32_t first, const uint32_t second) const { + const char* first_bucket = (first == kInvalidIndex) + ? target_.data() + : &file_data_.data()[first * bucket_len_]; + const char* second_bucket = + (second == kInvalidIndex) ? target_.data() + : &file_data_.data()[second * bucket_len_]; + return ucomp_->Compare(Slice(first_bucket, user_key_len_), + Slice(second_bucket, user_key_len_)) < 0; + } + + private: + const Slice file_data_; + const Comparator* ucomp_; + const uint32_t bucket_len_; + const uint32_t user_key_len_; + const Slice target_; + }; + + const BucketComparator bucket_comparator_; + void PrepareKVAtCurrIdx(); + CuckooTableReader* reader_; + bool initialized_; + // Contains a map of keys to bucket_id sorted in key order. + std::vector<uint32_t> sorted_bucket_ids_; + // We assume that the number of items can be stored in uint32 (4 Billion). 
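  // kInvalidIndex (UINT32_MAX) therefore does double duty: it marks an
  // unpositioned iterator and, inside BucketComparator, stands in for the
  // seek target rather than a real bucket id.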
+ uint32_t curr_key_idx_; + Slice curr_value_; + IterKey curr_key_; +}; + +CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) + : bucket_comparator_(reader->file_data_, reader->ucomp_, + reader->bucket_length_, reader->user_key_length_), + reader_(reader), + initialized_(false), + curr_key_idx_(kInvalidIndex) { + sorted_bucket_ids_.clear(); + curr_value_.clear(); + curr_key_.Clear(); +} + +void CuckooTableIterator::InitIfNeeded() { + if (initialized_) { + return; + } + sorted_bucket_ids_.reserve( + static_cast<size_t>(reader_->GetTableProperties()->num_entries)); + uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; + assert(num_buckets < kInvalidIndex); + const char* bucket = reader_->file_data_.data(); + for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { + if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { + sorted_bucket_ids_.push_back(bucket_id); + } + bucket += reader_->bucket_length_; + } + assert(sorted_bucket_ids_.size() == + reader_->GetTableProperties()->num_entries); + std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), + bucket_comparator_); + curr_key_idx_ = kInvalidIndex; + initialized_ = true; +} + +void CuckooTableIterator::SeekToFirst() { + InitIfNeeded(); + curr_key_idx_ = 0; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::SeekToLast() { + InitIfNeeded(); + curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::Seek(const Slice& target) { + InitIfNeeded(); + const BucketComparator seek_comparator( + reader_->file_data_, reader_->ucomp_, reader_->bucket_length_, + reader_->user_key_length_, ExtractUserKey(target)); + auto seek_it = + std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), + kInvalidIndex, seek_comparator); + curr_key_idx_ = + static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { + // Not supported + assert(false); +} + +bool CuckooTableIterator::Valid() const { + return curr_key_idx_ < sorted_bucket_ids_.size(); +} + +void CuckooTableIterator::PrepareKVAtCurrIdx() { + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + uint32_t id = sorted_bucket_ids_[curr_key_idx_]; + const char* offset = + reader_->file_data_.data() + id * reader_->bucket_length_; + if (reader_->is_last_level_) { + // Always return internal key. 
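    // Last-level files store bare user keys, so an internal key is
    // re-synthesized here with sequence number 0 and type kTypeValue to
    // satisfy the InternalIterator contract.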
+ curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0, + kTypeValue); + } else { + curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); + } + curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); +} + +void CuckooTableIterator::Next() { + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + ++curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::Prev() { + if (curr_key_idx_ == 0) { + curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); + } + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + --curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +Slice CuckooTableIterator::key() const { + assert(Valid()); + return curr_key_.GetInternalKey(); +} + +Slice CuckooTableIterator::value() const { + assert(Valid()); + return curr_value_; +} + +InternalIterator* CuckooTableReader::NewIterator( + const ReadOptions& /*read_options*/, + const SliceTransform* /* prefix_extractor */, Arena* arena, + bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { + if (!status().ok()) { + return NewErrorInternalIterator<Slice>( + Status::Corruption("CuckooTableReader status is not okay."), arena); + } + CuckooTableIterator* iter; + if (arena == nullptr) { + iter = new CuckooTableIterator(this); + } else { + auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); + iter = new (iter_mem) CuckooTableIterator(this); + } + return iter; +} + +size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } + +} // namespace ROCKSDB_NAMESPACE +#endif diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h new file mode 100644 index 000000000..f6c599ae8 --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
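A minimal sketch (not part of this diff) of how the cuckoo table format above is typically wired into a database: the reader requires mmap reads, and the factory comes from NewCuckooTableFactory() declared in the public rocksdb/table.h header. The helper name OpenCuckooDb and the tuning value shown are illustrative assumptions; the CuckooTableOptions fields are taken from the public API.

#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Assumes the default ROCKSDB_NAMESPACE (rocksdb).
rocksdb::Status OpenCuckooDb(const std::string& path, rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // CuckooTableReader reports InvalidArgument ("File is not mmaped") when
  // mmap reads are disabled, so enable them explicitly.
  options.allow_mmap_reads = true;
  rocksdb::CuckooTableOptions cuckoo_opts;
  cuckoo_opts.hash_table_ratio = 0.9;  // space utilization vs. displacement work
  options.table_factory.reset(rocksdb::NewCuckooTableFactory(cuckoo_opts));
  return rocksdb::DB::Open(options, path, db);
}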
+ +#pragma once +#ifndef ROCKSDB_LITE +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "file/random_access_file_reader.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class TableReader; +struct ImmutableOptions; + +class CuckooTableReader : public TableReader { + public: + CuckooTableReader(const ImmutableOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, const Comparator* user_comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); + ~CuckooTableReader() {} + + std::shared_ptr<const TableProperties> GetTableProperties() const override { + return table_props_; + } + + Status status() const { return status_; } + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + void Prepare(const Slice& target) override; + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const override; + + // Following methods are not implemented for Cuckoo Table Reader + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + void SetupForCompaction() override {} + // End of methods not implemented. + + private: + friend class CuckooTableIterator; + void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id); + std::unique_ptr<RandomAccessFileReader> file_; + Slice file_data_; + bool is_last_level_; + bool identity_as_first_hash_; + bool use_module_hash_; + std::shared_ptr<const TableProperties> table_props_; + Status status_; + uint32_t num_hash_func_; + std::string unused_key_; + uint32_t key_length_; + uint32_t user_key_length_; + uint32_t value_length_; + uint32_t bucket_length_; + uint32_t cuckoo_block_size_; + uint32_t cuckoo_block_bytes_minus_one_; + uint64_t table_size_; + const Comparator* ucomp_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, + uint64_t max_num_buckets); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc new file mode 100644 index 000000000..d3d1490c6 --- /dev/null +++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc @@ -0,0 +1,584 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#include <cinttypes> +#include <map> +#include <string> +#include <vector> + +#include "memory/arena.h" +#include "rocksdb/db.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" +#include "table/get_context.h" +#include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" +#include "util/random.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_string(file_dir, "", + "Directory where the files will be created" + " for benchmark. Added for using tmpfs."); +DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); +DEFINE_bool(write, false, + "Should write new values to file in performance tests?"); +DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); + +namespace ROCKSDB_NAMESPACE { + +namespace { +const uint32_t kNumHashFunc = 10; +// Methods, variables related to Hash functions. +std::unordered_map<std::string, std::vector<uint64_t> > hash_map; + +void AddHashLookups(const std::string& s, uint64_t bucket_id, + uint32_t num_hash_fun) { + std::vector<uint64_t> v; + for (uint32_t i = 0; i < num_hash_fun; i++) { + v.push_back(bucket_id + i); + } + hash_map[s] = v; +} + +uint64_t GetSliceHash(const Slice& s, uint32_t index, + uint64_t /*max_num_buckets*/) { + return hash_map[s.ToString()][index]; +} +} // namespace + +class CuckooReaderTest : public testing::Test { + public: + using testing::Test::SetUp; + + CuckooReaderTest() { + options.allow_mmap_reads = true; + env = options.env; + file_options = FileOptions(options); + } + + void SetUp(int num) { + num_items = num; + hash_map.clear(); + keys.clear(); + keys.resize(num_items); + user_keys.clear(); + user_keys.resize(num_items); + values.clear(); + values.resize(num_items); + } + + std::string NumToStr(int64_t i) { + return std::string(reinterpret_cast<char*>(&i), sizeof(i)); + } + + void CreateCuckooFileAndCheckReader( + const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname, + file_options, &file_writer, nullptr)); + CuckooTableBuilder builder( + file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, + GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { + builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); + ASSERT_OK(builder.status()); + ASSERT_EQ(builder.NumEntries(), key_idx + 1); + } + ASSERT_OK(builder.Finish()); + ASSERT_EQ(num_items, builder.NumEntries()); + file_size = builder.FileSize(); + ASSERT_OK(file_writer->Close()); + + // Check reader now. 
+ std::unique_ptr<RandomAccessFileReader> file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); + ASSERT_OK(reader.status()); + // Assume no merge/deletion + for (uint32_t i = 0; i < num_items; ++i) { + PinnableSlice value; + GetContext get_context(ucomp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(user_keys[i]), &value, + nullptr, nullptr, nullptr, nullptr, true, nullptr, + nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); + ASSERT_STREQ(values[i].c_str(), value.data()); + } + } + void UpdateKeys(bool with_zero_seqno) { + for (uint32_t i = 0; i < num_items; i++) { + ParsedInternalKey ikey(user_keys[i], with_zero_seqno ? 0 : i + 1000, + kTypeValue); + keys[i].clear(); + AppendInternalKey(&keys[i], ikey); + } + } + + void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr<RandomAccessFileReader> file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); + ASSERT_OK(reader.status()); + InternalIterator* it = reader.NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); + ASSERT_OK(it->status()); + ASSERT_TRUE(!it->Valid()); + it->SeekToFirst(); + int cnt = 0; + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + ++cnt; + it->Next(); + } + ASSERT_EQ(static_cast<uint32_t>(cnt), num_items); + + it->SeekToLast(); + cnt = static_cast<int>(num_items) - 1; + ASSERT_TRUE(it->Valid()); + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + --cnt; + it->Prev(); + } + ASSERT_EQ(cnt, -1); + + cnt = static_cast<int>(num_items) / 2; + it->Seek(keys[cnt]); + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + ++cnt; + it->Next(); + } + ASSERT_EQ(static_cast<uint32_t>(cnt), num_items); + delete it; + + Arena arena; + it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena, + /*skip_filters=*/false, + TableReaderCaller::kUncategorized); + ASSERT_OK(it->status()); + ASSERT_TRUE(!it->Valid()); + it->Seek(keys[num_items / 2]); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_TRUE(keys[num_items / 2] == it->key()); + ASSERT_TRUE(values[num_items / 2] == it->value()); + ASSERT_OK(it->status()); + it->~InternalIterator(); + } + + std::vector<std::string> keys; + std::vector<std::string> user_keys; + std::vector<std::string> values; + uint64_t num_items; + std::string fname; + uint64_t file_size; + Options options; + Env* env; + FileOptions file_options; +}; + +TEST_F(CuckooReaderTest, FileNotMmaped) { + options.allow_mmap_reads = false; + ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr); + ASSERT_TRUE(reader.status().IsInvalidArgument()); + ASSERT_STREQ("File is not mmaped", reader.status().getState()); +} + +TEST_F(CuckooReaderTest, WhenKeyExists) { + 
SetUp(kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i] = "key" + NumToStr(i); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values. + AddHashLookups(user_keys[i], i, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); + // Test with collision. Make all hash values collide. + hash_map.clear(); + for (uint32_t i = 0; i < num_items; i++) { + AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + UpdateKeys(false); + CreateCuckooFileAndCheckReader(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); +} + +TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { + SetUp(kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], static_cast<void*>(&i), 8); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values. + AddHashLookups(user_keys[i], i, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Test with collision. Make all hash values collide. + hash_map.clear(); + for (uint32_t i = 0; i < num_items; i++) { + AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + UpdateKeys(false); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); +} + +TEST_F(CuckooReaderTest, CheckIterator) { + SetUp(2 * kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i] = "key" + NumToStr(i); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values, in reverse order. + AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(); + CheckIterator(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); + CheckIterator(); +} + +TEST_F(CuckooReaderTest, CheckIteratorUint64) { + SetUp(2 * kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], static_cast<void*>(&i), 8); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values, in reverse order. + AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); +} + +TEST_F(CuckooReaderTest, WhenKeyNotFound) { + // Add keys with colliding hash values. 
+  SetUp(kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Make all hash values collide.
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  auto* ucmp = BytewiseComparator();
+  CreateCuckooFileAndCheckReader();
+
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  ASSERT_OK(RandomAccessFileReader::Create(
+      env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+
+  const ImmutableOptions ioptions(options);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp,
+                           GetSliceHash);
+  ASSERT_OK(reader.status());
+  // Search for a key with colliding hash values.
+  std::string not_found_user_key = "key" + NumToStr(num_items);
+  std::string not_found_key;
+  AddHashLookups(not_found_user_key, 0, kNumHashFunc);
+  ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue);
+  AppendInternalKey(&not_found_key, ikey);
+  PinnableSlice value;
+  GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
+                         Slice(not_found_key), &value, nullptr, nullptr,
+                         nullptr, nullptr, true, nullptr, nullptr);
+  ASSERT_OK(
+      reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+  // Search for a key with an independent hash value.
+  std::string not_found_user_key2 = "key" + NumToStr(num_items + 1);
+  AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc);
+  ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue);
+  std::string not_found_key2;
+  AppendInternalKey(&not_found_key2, ikey2);
+  value.Reset();
+  GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
+                          GetContext::kNotFound, Slice(not_found_key2), &value,
+                          nullptr, nullptr, nullptr, nullptr, true, nullptr,
+                          nullptr);
+  ASSERT_OK(
+      reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+
+  // Test read when key is unused key.
+  std::string unused_key =
+      reader.GetTableProperties()->user_collected_properties.at(
+          CuckooTablePropertyNames::kEmptyKey);
+  // Add hash values that map to empty buckets.
+ AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc, + kNumHashFunc); + value.Reset(); + GetContext get_context3( + ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), + &value, nullptr, nullptr, nullptr, nullptr, true, nullptr, nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); + ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.status()); +} + +// Performance tests +namespace { +void GetKeys(uint64_t num, std::vector<std::string>* keys) { + keys->clear(); + IterKey k; + k.SetInternalKey("", 0, kTypeValue); + std::string internal_key_suffix = k.GetInternalKey().ToString(); + ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size()); + for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { + uint64_t value = 2 * key_idx; + std::string new_key(reinterpret_cast<char*>(&value), sizeof(value)); + new_key += internal_key_suffix; + keys->push_back(new_key); + } +} + +std::string GetFileName(uint64_t num) { + if (FLAGS_file_dir.empty()) { + FLAGS_file_dir = test::TmpDir(); + } + return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") + + std::to_string(num / 1000000) + "Mkeys"; +} + +// Create last level file as we are interested in measuring performance of +// last level file only. +void WriteFile(const std::vector<std::string>& keys, const uint64_t num, + double hash_ratio) { + Options options; + options.allow_mmap_reads = true; + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); + std::string fname = GetFileName(num); + + std::unique_ptr<WritableFileWriter> file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer, + nullptr)); + CuckooTableBuilder builder( + file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, + false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { + // Value is just a part of key. 
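    // (its first four bytes), which keeps the benchmark file small while
    // still giving Get() a non-empty value to return.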
+ builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4)); + ASSERT_EQ(builder.NumEntries(), key_idx + 1); + ASSERT_OK(builder.status()); + } + ASSERT_OK(builder.Finish()); + ASSERT_EQ(num, builder.NumEntries()); + ASSERT_OK(file_writer->Close()); + + uint64_t file_size; + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr<RandomAccessFileReader> file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); + ASSERT_OK(reader.status()); + ReadOptions r_options; + PinnableSlice value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + for (uint64_t i = 0; i < num; ++i) { + value.Reset(); + value.clear(); + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr)); + ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); + } +} + +void ReadKeys(uint64_t num, uint32_t batch_size) { + Options options; + options.allow_mmap_reads = true; + Env* env = options.env; + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); + std::string fname = GetFileName(num); + + uint64_t file_size; + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr<RandomAccessFileReader> file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); + ASSERT_OK(reader.status()); + const UserCollectedProperties user_props = + reader.GetTableProperties()->user_collected_properties; + const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>( + user_props.at(CuckooTablePropertyNames::kNumHashFunc).data()); + const uint64_t table_size = *reinterpret_cast<const uint64_t*>( + user_props.at(CuckooTablePropertyNames::kHashTableSize).data()); + fprintf(stderr, + "With %" PRIu64 + " items, utilization is %.2f%%, number of" + " hash functions: %u.\n", + num, num * 100.0 / (table_size), num_hash_fun); + ReadOptions r_options; + + std::vector<uint64_t> keys; + keys.reserve(num); + for (uint64_t i = 0; i < num; ++i) { + keys.push_back(2 * i); + } + RandomShuffle(keys.begin(), keys.end()); + + PinnableSlice value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + uint64_t start_time = env->NowMicros(); + if (batch_size > 0) { + for (uint64_t i = 0; i < num; i += batch_size) { + for (uint64_t j = i; j < i + batch_size && j < num; ++j) { + reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16)); + } + for (uint64_t j = i; j < i + batch_size && j < num; ++j) { + reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16), + &get_context, nullptr); + } + } + } else { + for (uint64_t i = 0; i < num; i++) { + reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16), + &get_context, nullptr); + } + } + float time_per_op = (env->NowMicros() - start_time) * 1.0f / num; + fprintf(stderr, + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", + time_per_op, 1.0 / 
time_per_op, batch_size); +} +} // namespace. + +TEST_F(CuckooReaderTest, TestReadPerformance) { + if (!FLAGS_enable_perf) { + return; + } + double hash_ratio = 0.95; + // These numbers are chosen to have a hash utilization % close to + // 0.9, 0.75, 0.6 and 0.5 respectively. + // They all create 128 M buckets. + std::vector<uint64_t> nums = {120 * 1024 * 1024, 100 * 1024 * 1024, + 80 * 1024 * 1024, 70 * 1024 * 1024}; +#ifndef NDEBUG + fprintf( + stdout, + "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); +#endif + for (uint64_t num : nums) { + if (FLAGS_write || + Env::Default()->FileExists(GetFileName(num)).IsNotFound()) { + std::vector<std::string> all_keys; + GetKeys(num, &all_keys); + WriteFile(all_keys, num, hash_ratio); + } + ReadKeys(num, 0); + ReadKeys(num, 10); + ReadKeys(num, 25); + ReadKeys(num, 50); + ReadKeys(num, 100); + fprintf(stderr, "\n"); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + if (ROCKSDB_NAMESPACE::port::kLittleEndian) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); + } else { + fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n"); + return 0; + } +} + +#endif // GFLAGS. + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc new file mode 100644 index 000000000..efde5e169 --- /dev/null +++ b/src/rocksdb/table/format.cc @@ -0,0 +1,575 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
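The batched loop in ReadKeys() above is the benchmark's main trick: Prepare() is issued for a whole batch before any Get(), so the cache-line prefetches of each key's cuckoo block can complete before the lookups touch memory. A small sketch of that pattern (the function name, the ikeys vector of pre-built internal keys, and the caller-supplied GetContext are assumptions, not code from this diff):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

#include "rocksdb/options.h"
#include "table/cuckoo/cuckoo_table_reader.h"
#include "table/get_context.h"

using namespace ROCKSDB_NAMESPACE;

void BatchedCuckooLookups(CuckooTableReader& reader,
                          const std::vector<std::string>& ikeys,
                          GetContext* get_context, size_t batch_size) {
  ReadOptions ro;
  for (size_t i = 0; i < ikeys.size(); i += batch_size) {
    size_t end = std::min(ikeys.size(), i + batch_size);
    // Phase 1: prefetch the first candidate cuckoo block for each key.
    for (size_t j = i; j < end; ++j) {
      reader.Prepare(Slice(ikeys[j]));
    }
    // Phase 2: run the point lookups; the buckets are likely cache-resident.
    for (size_t j = i; j < end; ++j) {
      Status s = reader.Get(ro, Slice(ikeys[j]), get_context,
                            /*prefix_extractor=*/nullptr);
      assert(s.ok());
      (void)s;
    }
  }
}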
+ +#include "table/format.h" + +#include <cinttypes> +#include <string> + +#include "block_fetcher.h" +#include "file/random_access_file_reader.h" +#include "memory/memory_allocator.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "options/options_helper.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/persistent_cache_helper.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/hash.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; + +#ifndef ROCKSDB_LITE +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +#else +// ROCKSDB_LITE doesn't have plain table +const uint64_t kLegacyPlainTableMagicNumber = 0; +const uint64_t kPlainTableMagicNumber = 0; +#endif +const char* kHostnameForDbHostId = "__hostname__"; + +bool ShouldReportDetailedTime(Env* env, Statistics* stats) { + return env != nullptr && stats != nullptr && + stats->get_stats_level() > kExceptDetailedTimers; +} + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + PutVarint64Varint64(dst, offset_, size_); +} + +char* BlockHandle::EncodeTo(char* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + char* cur = EncodeVarint64(dst, offset_); + cur = EncodeVarint64(cur, size_); + return cur; +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +// Return a string that contains the copy of handle. 
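// For example, a handle with offset 128 and size 256 encodes as the varint
// bytes 0x80 0x01 0x80 0x02, so ToString(true) should return "80018002".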
+std::string BlockHandle::ToString(bool hex) const { + std::string handle_str; + EncodeTo(&handle_str); + if (hex) { + return Slice(handle_str).ToString(true); + } else { + return handle_str; + } +} + +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); + +void IndexValue::EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const { + if (previous_handle) { + // WART: this is specific to Block-based table + assert(handle.offset() == previous_handle->offset() + + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize); + PutVarsignedint64(dst, handle.size() - previous_handle->size()); + } else { + handle.EncodeTo(dst); + } + assert(dst->size() != 0); + + if (have_first_key) { + PutLengthPrefixedSlice(dst, first_internal_key); + } +} + +Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle) { + if (previous_handle) { + int64_t delta; + if (!GetVarsignedint64(input, &delta)) { + return Status::Corruption("bad delta-encoded index value"); + } + // WART: this is specific to Block-based table + handle = BlockHandle(previous_handle->offset() + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize, + previous_handle->size() + delta); + } else { + Status s = handle.DecodeFrom(input); + if (!s.ok()) { + return s; + } + } + + if (!have_first_key) { + first_internal_key = Slice(); + } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { + return Status::Corruption("bad first key in block info"); + } + + return Status::OK(); +} + +std::string IndexValue::ToString(bool hex, bool have_first_key) const { + std::string s; + EncodeTo(&s, have_first_key, nullptr); + if (hex) { + return Slice(s).ToString(true); + } else { + return s; + } +} + +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return magic_number; +} +inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber) { + return kLegacyBlockBasedTableMagicNumber; + } + if (magic_number == kPlainTableMagicNumber) { + return kLegacyPlainTableMagicNumber; + } + assert(false); + return magic_number; +} +inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber || + magic_number == kLegacyBlockBasedTableMagicNumber) { + return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize); + } else { + return 0; + } +} + +// Footer format, in three parts: +// * Part1 +// -> format_version == 0 (inferred from legacy magic number) +// <empty> (0 bytes) +// -> format_version >= 1 +// checksum type (char, 1 byte) +// * Part2 +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 +// * Part3 +// -> format_version == 0 (inferred from legacy magic number) +// legacy magic number (8 bytes) +// -> format_version >= 1 (inferred from NOT legacy magic number) +// format_version (uint32LE, 4 bytes), also called "footer version" +// newer magic number (8 bytes) + +constexpr size_t kFooterPart2Size = 2 * 
BlockHandle::kMaxEncodedLength; +} // namespace + +void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle) { + (void)footer_offset; // Future use + + assert(magic_number != Footer::kNullTableMagicNumber); + assert(IsSupportedFormatVersion(format_version)); + + char* part2; + char* part3; + if (format_version > 0) { + slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength); + // Generate parts 1 and 3 + char* cur = data_.data(); + // Part 1 + *(cur++) = checksum_type; + // Part 2 + part2 = cur; + // Skip over part 2 for now + cur += kFooterPart2Size; + // Part 3 + part3 = cur; + EncodeFixed32(cur, format_version); + cur += 4; + EncodeFixed64(cur, magic_number); + assert(cur + 8 == slice_.data() + slice_.size()); + } else { + slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); + // Legacy SST files use kCRC32c checksum but it's not stored in footer. + assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); + // Generate part 3 (part 1 empty, skip part 2 for now) + part2 = data_.data(); + part3 = part2 + kFooterPart2Size; + char* cur = part3; + // Use legacy magic numbers to indicate format_version=0, for + // compatibility. No other cases should use format_version=0. + EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number)); + assert(cur + 8 == slice_.data() + slice_.size()); + } + + { + char* cur = part2; + cur = metaindex_handle.EncodeTo(cur); + cur = index_handle.EncodeTo(cur); + // Zero pad remainder + std::fill(cur, part3, char{0}); + } +} + +Status Footer::DecodeFrom(Slice input, uint64_t input_offset) { + (void)input_offset; // Future use + + // Only decode to unused Footer + assert(table_magic_number_ == kNullTableMagicNumber); + assert(input != nullptr); + assert(input.size() >= kMinEncodedLength); + + const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; + uint64_t magic = DecodeFixed64(magic_ptr); + + // We check for legacy formats here and silently upconvert them + bool legacy = IsLegacyFooterFormat(magic); + if (legacy) { + magic = UpconvertLegacyFooterFormat(magic); + } + table_magic_number_ = magic; + block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); + + // Parse Part3 + if (legacy) { + // The size is already asserted to be at least kMinEncodedLength + // at the beginning of the function + input.remove_prefix(input.size() - kVersion0EncodedLength); + format_version_ = 0 /* legacy */; + checksum_type_ = kCRC32c; + } else { + const char* part3_ptr = magic_ptr - 4; + format_version_ = DecodeFixed32(part3_ptr); + if (!IsSupportedFormatVersion(format_version_)) { + return Status::Corruption("Corrupt or unsupported format_version: " + + std::to_string(format_version_)); + } + // All known format versions >= 1 occupy exactly this many bytes. 
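    // (1 checksum-type byte + 40 bytes of block handles and padding +
    // 4-byte format version + 8-byte magic number = 53 bytes, given
    // kMaxVarint64Length == 10.)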
+ if (input.size() < kNewVersionsEncodedLength) { + return Status::Corruption("Input is too short to be an SST file"); + } + uint64_t adjustment = input.size() - kNewVersionsEncodedLength; + input.remove_prefix(adjustment); + + // Parse Part1 + char chksum = input.data()[0]; + checksum_type_ = lossless_cast<ChecksumType>(chksum); + if (!IsSupportedChecksumType(checksum_type())) { + return Status::Corruption("Corrupt or unsupported checksum type: " + + std::to_string(lossless_cast<uint8_t>(chksum))); + } + // Consume checksum type field + input.remove_prefix(1); + } + + // Parse Part2 + Status result = metaindex_handle_.DecodeFrom(&input); + if (result.ok()) { + result = index_handle_.DecodeFrom(&input); + } + return result; + // Padding in part2 is ignored +} + +std::string Footer::ToString() const { + std::string result; + result.reserve(1024); + + bool legacy = IsLegacyFooterFormat(table_magic_number_); + if (legacy) { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } else { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + result.append("format version: " + std::to_string(format_version_) + + "\n "); + } + return result; +} + +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number) { + if (file_size < Footer::kMinEncodedLength) { + return Status::Corruption("file is too short (" + + std::to_string(file_size) + + " bytes) to be an " + "sstable: " + + file->file_name()); + } + + std::string footer_buf; + AlignedBuf internal_buf; + Slice footer_input; + uint64_t read_offset = (file_size > Footer::kMaxEncodedLength) + ? file_size - Footer::kMaxEncodedLength + : 0; + Status s; + // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, + // there is no readahead for point lookups, so TryReadFromCache will fail if + // the required data is not in the prefetch buffer. Once deadline is enabled + // for iterator, TryReadFromCache might do a readahead. Revisit to see if we + // need to pass a timeout at that point + // TODO: rate limit footer reads. + if (prefetch_buffer == nullptr || + !prefetch_buffer->TryReadFromCache( + opts, file, read_offset, Footer::kMaxEncodedLength, &footer_input, + nullptr, opts.rate_limiter_priority)) { + if (file->use_direct_io()) { + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, nullptr, &internal_buf, + opts.rate_limiter_priority); + } else { + footer_buf.reserve(Footer::kMaxEncodedLength); + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, &footer_buf[0], nullptr, + opts.rate_limiter_priority); + } + if (!s.ok()) return s; + } + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. + if (footer_input.size() < Footer::kMinEncodedLength) { + // FIXME: this error message is bad. We should be checking whether the + // provided file_size matches what's on disk, at least in this case. + // Unfortunately FileSystem/Env does not provide a way to get the size + // of an open file, so getting file size requires a full path seek. 
+ return Status::Corruption("file is too short (" + + std::to_string(file_size) + + " bytes) to be an " + "sstable" + + file->file_name()); + } + + s = footer->DecodeFrom(footer_input, read_offset); + if (!s.ok()) { + return s; + } + if (enforce_table_magic_number != 0 && + enforce_table_magic_number != footer->table_magic_number()) { + return Status::Corruption("Bad table magic number: expected " + + std::to_string(enforce_table_magic_number) + + ", found " + + std::to_string(footer->table_magic_number()) + + " in " + file->file_name()); + } + return Status::OK(); +} + +namespace { +// Custom handling for the last byte of a block, to avoid invoking streaming +// API to get an effective block checksum. This function is its own inverse +// because it uses xor. +inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) { + // This strategy bears some resemblance to extending a CRC checksum by one + // more byte, except we don't need to re-mix the input checksum as long as + // we do this step only once (per checksum). + const uint32_t kRandomPrime = 0x6b9083d9; + return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime; +} +} // namespace + +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t data_size) { + switch (type) { + case kCRC32c: + return crc32c::Mask(crc32c::Value(data, data_size)); + case kxxHash: + return XXH32(data, data_size, /*seed*/ 0); + case kxxHash64: + return Lower32of64(XXH64(data, data_size, /*seed*/ 0)); + case kXXH3: { + if (data_size == 0) { + // Special case because of special handling for last byte, not + // present in this case. Can be any value different from other + // small input size checksums. + return 0; + } else { + // See corresponding code in ComputeBuiltinChecksumWithLastByte + uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1)); + return ModifyChecksumForLastByte(v, data[data_size - 1]); + } + } + default: // including kNoChecksum + return 0; + } +} + +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t data_size, char last_byte) { + switch (type) { + case kCRC32c: { + uint32_t crc = crc32c::Value(data, data_size); + // Extend to cover last byte (compression type) + crc = crc32c::Extend(crc, &last_byte, 1); + return crc32c::Mask(crc); + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH32_update(state, &last_byte, 1); + uint32_t v = XXH32_digest(state); + XXH32_freeState(state); + return v; + } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH64_update(state, &last_byte, 1); + uint32_t v = Lower32of64(XXH64_digest(state)); + XXH64_freeState(state); + return v; + } + case kXXH3: { + // XXH3 is a complicated hash function that is extremely fast on + // contiguous input, but that makes its streaming support rather + // complex. It is worth custom handling of the last byte (`type`) + // in order to avoid allocating a large state object and bringing + // that code complexity into CPU working set. 
+ uint32_t v = Lower32of64(XXH3_64bits(data, data_size)); + return ModifyChecksumForLastByte(v, last_byte); + } + default: // including kNoChecksum + return 0; + } +} + +Status UncompressBlockData(const UncompressionInfo& uncompression_info, + const char* data, size_t size, + BlockContents* out_contents, uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator) { + Status ret = Status::OK(); + + assert(uncompression_info.type() != kNoCompression && + "Invalid compression type"); + + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); + size_t uncompressed_size = 0; + CacheAllocationPtr ubuf = + UncompressData(uncompression_info, data, size, &uncompressed_size, + GetCompressFormatForVersion(format_version), allocator); + if (!ubuf) { + if (!CompressionTypeSupported(uncompression_info.type())) { + return Status::NotSupported( + "Unsupported compression method for this build", + CompressionTypeToString(uncompression_info.type())); + } else { + return Status::Corruption( + "Corrupted compressed block contents", + CompressionTypeToString(uncompression_info.type())); + } + } + + *out_contents = BlockContents(std::move(ubuf), uncompressed_size); + + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, + out_contents->data.size()); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); + + TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue", + static_cast<void*>(&ret)); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockData:" + "TamperWithDecompressionOutput", + static_cast<void*>(out_contents)); + + return ret; +} + +Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info, + const char* data, size_t size, + BlockContents* out_contents, + uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator) { + assert(data[size] != kNoCompression); + assert(data[size] == static_cast<char>(uncompression_info.type())); + return UncompressBlockData(uncompression_info, data, size, out_contents, + format_version, ioptions, allocator); +} + +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h new file mode 100644 index 000000000..ffb9fb0ca --- /dev/null +++ b/src/rocksdb/table/format.h @@ -0,0 +1,375 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
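The pair of checksum helpers defined just above exists because the block trailer checksum covers the block payload plus the 1-byte compression type, and that byte is not always stored contiguously with the payload. A small sketch of the documented identity between the two forms (the function name and variables are assumptions; only the two helpers, kCRC32c, and the header path come from this diff):

#include <cassert>
#include <string>

#include "table/format.h"

// Assumes the default ROCKSDB_NAMESPACE (rocksdb).
void ChecksumIdentitySketch(const std::string& payload, char compression_type) {
  using namespace ROCKSDB_NAMESPACE;
  // Contiguous copy: payload immediately followed by the compression type.
  std::string with_type = payload;
  with_type.push_back(compression_type);
  uint32_t whole =
      ComputeBuiltinChecksum(kCRC32c, with_type.data(), with_type.size());
  // Same logical bytes, but the trailing byte is passed separately so the
  // caller does not have to copy it next to the payload.
  uint32_t split = ComputeBuiltinChecksumWithLastByte(
      kCRC32c, payload.data(), payload.size(), compression_type);
  assert(whole == split);
  (void)whole;
  (void)split;
}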
+ +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" +#include "memory/memory_allocator.h" +#include "options/cf_options.h" +#include "port/malloc.h" +#include "port/port.h" // noexcept +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFile; +struct ReadOptions; + +bool ShouldReportDetailedTime(Env* env, Statistics* stats); + +// the length of the magic number in bytes. +constexpr uint32_t kMagicNumberLengthByte = 8; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + // Creates a block handle with special values indicating "uninitialized," + // distinct from the "null" block handle. + BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); + + // The offset of the block in the file. + uint64_t offset() const { return offset_; } + void set_offset(uint64_t _offset) { offset_ = _offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t _size) { size_ = _size; } + + void EncodeTo(std::string* dst) const; + char* EncodeTo(char* dst) const; + Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); + + // Return a string that contains the copy of handle. + std::string ToString(bool hex = true) const; + + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { return offset_ == 0 && size_ == 0; } + + static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } + + // Maximum encoding length of a BlockHandle + static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length; + + inline bool operator==(const BlockHandle& rhs) const { + return offset_ == rhs.offset_ && size_ == rhs.size_; + } + inline bool operator!=(const BlockHandle& rhs) const { + return !(*this == rhs); + } + + private: + uint64_t offset_; + uint64_t size_; + + static const BlockHandle kNullBlockHandle; +}; + +// Value in block-based table file index. +// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + +inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { + // As of format_version 2, we encode compressed block with + // compress_format_version == 2. 
Before that, the version is 1. + // DO NOT CHANGE THIS FUNCTION, it affects disk format + return format_version >= 2 ? 2 : 1; +} + +constexpr uint32_t kLatestFormatVersion = 5; + +inline bool IsSupportedFormatVersion(uint32_t version) { + return version <= kLatestFormatVersion; +} + +// Footer encapsulates the fixed information stored at the tail end of every +// SST file. In general, it should only include things that cannot go +// elsewhere under the metaindex block. For example, checksum_type is +// required for verifying metaindex block checksum (when applicable), but +// index block handle can easily go in metaindex block (possible future). +// See also FooterBuilder below. +class Footer { + public: + // Create empty. Populate using DecodeFrom. + Footer() {} + + // Deserialize a footer (populate fields) from `input` and check for various + // corruptions. `input_offset` is the offset within the target file of + // `input` buffer (future use). + Status DecodeFrom(Slice input, uint64_t input_offset); + + // Table magic number identifies file as RocksDB SST file and which kind of + // SST format is use. + uint64_t table_magic_number() const { return table_magic_number_; } + + // A version (footer and more) within a kind of SST. (It would add more + // unnecessary complexity to separate footer versions and + // BBTO::format_version.) + uint32_t format_version() const { return format_version_; } + + // Block handle for metaindex block. + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + + // Block handle for (top-level) index block. + const BlockHandle& index_handle() const { return index_handle_; } + + // Checksum type used in the file. + ChecksumType checksum_type() const { + return static_cast<ChecksumType>(checksum_type_); + } + + // Block trailer size used by file with this footer (e.g. 5 for block-based + // table and 0 for plain table). This is inferred from magic number so + // not in the serialized form. + inline size_t GetBlockTrailerSize() const { return block_trailer_size_; } + + // Convert this object to a human readable form + std::string ToString() const; + + // Encoded lengths of Footers. Bytes for serialized Footer will always be + // >= kMinEncodedLength and <= kMaxEncodedLength. + // + // Footer version 0 (legacy) will always occupy exactly this many bytes. + // It consists of two block handles, padding, and a magic number. + static constexpr uint32_t kVersion0EncodedLength = + 2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte; + static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength; + + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. It originally consisted of the checksum type, two block handles, + // padding (to maximum handle encoding size), a format version number, and a + // magic number. 
+ static constexpr uint32_t kNewVersionsEncodedLength = + 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte; + static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength; + + static constexpr uint64_t kNullTableMagicNumber = 0; + + static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU; + + private: + static constexpr int kInvalidChecksumType = + (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum; + + uint64_t table_magic_number_ = kNullTableMagicNumber; + uint32_t format_version_ = kInvalidFormatVersion; + BlockHandle metaindex_handle_; + BlockHandle index_handle_; + int checksum_type_ = kInvalidChecksumType; + uint8_t block_trailer_size_ = 0; +}; + +// Builder for Footer +class FooterBuilder { + public: + // Run builder in inputs. This is a single step with lots of parameters for + // efficiency (based on perf testing). + // * table_magic_number identifies file as RocksDB SST file and which kind of + // SST format is use. + // * format_version is a version for the footer and can also apply to other + // aspects of the SST file (see BlockBasedTableOptions::format_version). + // NOTE: To save complexity in the caller, when format_version == 0 and + // there is a corresponding legacy magic number to the one specified, the + // legacy magic number will be written for forward compatibility. + // * footer_offset is the file offset where the footer will be written + // (for future use). + // * checksum_type is for formats using block checksums. + // * index_handle is optional for some kinds of SST files. + void Build(uint64_t table_magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle = BlockHandle::NullBlockHandle()); + + // After Builder, get a Slice for the serialized Footer, backed by this + // FooterBuilder. + const Slice& GetSlice() const { + assert(slice_.size()); + return slice_; + } + + private: + Slice slice_; + std::array<char, Footer::kMaxEncodedLength> data_; +}; + +// Read the footer from file +// If enforce_table_magic_number != 0, ReadFooterFromFile() will return +// corruption if table_magic number is not equal to enforce_table_magic_number +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number = 0); + +// Computes a checksum using the given ChecksumType. Sometimes we need to +// include one more input byte logically at the end but not part of the main +// data buffer. If data_size >= 1, then +// ComputeBuiltinChecksum(type, data, size) +// == +// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1]) +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t size); +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t size, char last_byte); + +// Represents the contents of a block read from an SST file. Depending on how +// it's created, it may or may not own the actual block bytes. As an example, +// BlockContents objects representing data read from mmapped files only point +// into the mmapped region. Depending on context, it might be a serialized +// (potentially compressed) block, including a trailer beyond `size`, or an +// uncompressed block. +// +// Please try to use this terminology when dealing with blocks: +// * "Serialized block" - bytes that go into storage. 
For block-based table +// (usually the case) this includes the block trailer. Here the `size` does +// not include the trailer, but other places in code might include the trailer +// in the size. +// * "Maybe compressed block" - like a serialized block, but without the +// trailer (or no promise of including a trailer). Must be accompanied by a +// CompressionType in some other variable or field. +// * "Uncompressed block" - "payload" bytes that are either stored with no +// compression, used as input to compression function, or result of +// decompression function. +// * "Parsed block" - an in-memory form of a block in block cache, as it is +// used by the table reader. Different C++ types are used depending on the +// block type (see block_like_traits.h). Only trivially parsable block types +// use BlockContents as the parsed form. +// +struct BlockContents { + // Points to block payload (without trailer) + Slice data; + CacheAllocationPtr allocation; + +#ifndef NDEBUG + // Whether there is a known trailer after what is pointed to by `data`. + // See BlockBasedTable::GetCompressionType. + bool has_trailer = false; +#endif // NDEBUG + + BlockContents() {} + + // Does not take ownership of the underlying data bytes. + BlockContents(const Slice& _data) : data(_data) {} + + // Takes ownership of the underlying data bytes. + BlockContents(CacheAllocationPtr&& _data, size_t _size) + : data(_data.get(), _size), allocation(std::move(_data)) {} + + // Takes ownership of the underlying data bytes. + BlockContents(std::unique_ptr<char[]>&& _data, size_t _size) + : data(_data.get(), _size) { + allocation.reset(_data.release()); + } + + // Returns whether the object has ownership of the underlying data bytes. + bool own_bytes() const { return allocation.get() != nullptr; } + + // The additional memory space taken by the block data. + size_t usable_size() const { + if (allocation.get() != nullptr) { + auto allocator = allocation.get_deleter().allocator; + if (allocator) { + return allocator->UsableSize(allocation.get(), data.size()); + } +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size(allocation.get()); +#else + return data.size(); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + } else { + return 0; // no extra memory is occupied by the data + } + } + + size_t ApproximateMemoryUsage() const { + return usable_size() + sizeof(*this); + } + + BlockContents(BlockContents&& other) noexcept { *this = std::move(other); } + + BlockContents& operator=(BlockContents&& other) { + data = std::move(other.data); + allocation = std::move(other.allocation); +#ifndef NDEBUG + has_trailer = other.has_trailer; +#endif // NDEBUG + return *this; + } +}; + +// The `data` points to serialized block contents read in from file, which +// must be compressed and include a trailer beyond `size`. A new buffer is +// allocated with the given allocator (or default) and the uncompressed +// contents are returned in `out_contents`. +// format_version is as defined in include/rocksdb/table.h, which is +// used to determine compression format version. +Status UncompressSerializedBlock(const UncompressionInfo& info, + const char* data, size_t size, + BlockContents* out_contents, + uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator = nullptr); + +// This is a variant of UncompressSerializedBlock that does not expect a +// block trailer beyond `size`. (CompressionType is taken from `info`.) 
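+//
+// Illustrative call pattern (a sketch added for clarity, not part of the
+// original header; `info`, `ioptions`, `compressed_buf`, `compressed_size`
+// and `format_version` are assumed to be prepared by the caller):
+//   BlockContents out;
+//   Status s = UncompressBlockData(info, compressed_buf, compressed_size,
+//                                  &out, format_version, ioptions);
+//   if (s.ok()) { /* out.data points to the uncompressed payload */ }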
+Status UncompressBlockData(const UncompressionInfo& info, const char* data, + size_t size, BlockContents* out_contents, + uint32_t format_version, + const ImmutableOptions& ioptions, + MemoryAllocator* allocator = nullptr); + +// Replace db_host_id contents with the real hostname if necessary +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id); + +// Implementation details follow. Clients should ignore, + +// TODO(andrewkr): we should prefer one way of representing a null/uninitialized +// BlockHandle. Currently we use zeros for null and use negation-of-zeros for +// uninitialized. +inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {} + +inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) + : offset_(_offset), size_(_size) {} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc new file mode 100644 index 000000000..69e752714 --- /dev/null +++ b/src/rocksdb/table/get_context.cc @@ -0,0 +1,604 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/get_context.h" + +#include "db/blob//blob_fetcher.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/read_callback.h" +#include "db/wide/wide_column_serialization.h" +#include "monitoring/file_read_sample.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { +#ifndef ROCKSDB_LITE + if (replay_log) { + if (replay_log->empty()) { + // Optimization: in the common case of only one operation in the + // log, we allocate the exact amount of space needed. 
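+      // (Descriptive note added for clarity: the reservation below is 1 byte
+      // for the ValueType tag, plus VarintLength(value.size()) bytes for the
+      // length prefix, plus value.size() bytes for the payload, matching the
+      // push_back() and PutLengthPrefixedSlice() calls that follow.)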
+ replay_log->reserve(1 + VarintLength(value.size()) + value.size()); + } + replay_log->push_back(type); + PutLengthPrefixedSlice(replay_log, value); + } +#else + (void)replay_log; + (void)type; + (void)value; +#endif // ROCKSDB_LITE +} + +} // namespace + +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, PinnableWideColumns* columns, + std::string* timestamp, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + pinnable_val_(pinnable_val), + columns_(columns), + timestamp_(timestamp), + value_found_(value_found), + merge_context_(merge_context), + max_covering_tombstone_seq_(_max_covering_tombstone_seq), + clock_(clock), + seq_(seq), + replay_log_(nullptr), + pinned_iters_mgr_(_pinned_iters_mgr), + callback_(callback), + do_merge_(do_merge), + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { + if (seq_) { + *seq_ = kMaxSequenceNumber; + } + sample_ = should_sample_file_read(); +} + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + PinnableWideColumns* columns, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, + pinnable_val, columns, /*timestamp=*/nullptr, value_found, + merge_context, do_merge, _max_covering_tombstone_seq, clock, + seq, _pinned_iters_mgr, callback, is_blob_index, + tracing_get_id, blob_fetcher) {} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. 
In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +void GetContext::MarkKeyMayExist() { + state_ = kFound; + if (value_found_ != nullptr) { + *value_found_ = false; + } +} + +void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { + assert(state_ == kNotFound); + appendToReplayLog(replay_log_, kTypeValue, value); + + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + pinnable_val_->PinSelf(value); + } +} + +void GetContext::ReportCounters() { + if (get_context_stats_.num_cache_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); + } + if (get_context_stats_.num_cache_index_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT, + get_context_stats_.num_cache_index_hit); + } + if (get_context_stats_.num_cache_data_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_HIT, + get_context_stats_.num_cache_data_hit); + } + if (get_context_stats_.num_cache_filter_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT, + get_context_stats_.num_cache_filter_hit); + } + if (get_context_stats_.num_cache_compression_dict_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT, + get_context_stats_.num_cache_compression_dict_hit); + } + if (get_context_stats_.num_cache_index_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS, + get_context_stats_.num_cache_index_miss); + } + if (get_context_stats_.num_cache_filter_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS, + get_context_stats_.num_cache_filter_miss); + } + if (get_context_stats_.num_cache_data_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_MISS, + get_context_stats_.num_cache_data_miss); + } + if (get_context_stats_.num_cache_compression_dict_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS, + get_context_stats_.num_cache_compression_dict_miss); + } + if (get_context_stats_.num_cache_bytes_read > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_READ, + get_context_stats_.num_cache_bytes_read); + } + if (get_context_stats_.num_cache_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_MISS, + get_context_stats_.num_cache_miss); + } + if (get_context_stats_.num_cache_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); + } + if (get_context_stats_.num_cache_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT, + get_context_stats_.num_cache_add_redundant); + } + if (get_context_stats_.num_cache_bytes_write > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, + get_context_stats_.num_cache_bytes_write); + } + if (get_context_stats_.num_cache_index_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, + get_context_stats_.num_cache_index_add); + } + if (get_context_stats_.num_cache_index_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT, + get_context_stats_.num_cache_index_add_redundant); + } + if (get_context_stats_.num_cache_index_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, + get_context_stats_.num_cache_index_bytes_insert); + } + if (get_context_stats_.num_cache_data_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, + get_context_stats_.num_cache_data_add); + } + if (get_context_stats_.num_cache_data_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT, + 
get_context_stats_.num_cache_data_add_redundant); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); + } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT, + get_context_stats_.num_cache_filter_add_redundant); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + get_context_stats_.num_cache_filter_bytes_insert); + } + if (get_context_stats_.num_cache_compression_dict_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, + get_context_stats_.num_cache_compression_dict_add); + } + if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + get_context_stats_.num_cache_compression_dict_add_redundant); + } + if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + get_context_stats_.num_cache_compression_dict_bytes_insert); + } +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value, bool* matched, + Cleanable* value_pinner) { + assert(matched); + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { + *matched = true; + // If the value is not in the snapshot, skip it + if (!CheckCallback(parsed_key.sequence)) { + return true; // to continue to the next seq + } + + appendToReplayLog(replay_log_, parsed_key.type, value); + + if (seq_ != nullptr) { + // Set the sequence number if it is uninitialized + if (*seq_ == kMaxSequenceNumber) { + *seq_ = parsed_key.sequence; + } + if (max_covering_tombstone_seq_) { + *seq_ = std::max(*seq_, *max_covering_tombstone_seq_); + } + } + + size_t ts_sz = ucmp_->timestamp_size(); + if (ts_sz > 0 && timestamp_ != nullptr) { + if (!timestamp_->empty()) { + assert(ts_sz == timestamp_->size()); + // `timestamp` can be set before `SaveValue` is ever called + // when max_covering_tombstone_seq_ was set. + // If this key has a higher sequence number than range tombstone, + // then timestamp should be updated. `ts_from_rangetombstone_` is + // set to false afterwards so that only the key with highest seqno + // updates the timestamp. + if (ts_from_rangetombstone_) { + assert(max_covering_tombstone_seq_); + if (parsed_key.sequence > *max_covering_tombstone_seq_) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + ts_from_rangetombstone_ = false; + } + } + } + // TODO optimize for small size ts + const std::string kMaxTs(ts_sz, '\xff'); + if (timestamp_->empty() || + ucmp_->CompareTimestamp(*timestamp_, kMaxTs) == 0) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + } + } + + auto type = parsed_key.type; + // Key matches. 
Process it + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex || + type == kTypeWideColumnEntity || type == kTypeDeletion || + type == kTypeDeletionWithTimestamp || type == kTypeSingleDeletion) && + max_covering_tombstone_seq_ != nullptr && + *max_covering_tombstone_seq_ > parsed_key.sequence) { + // Note that deletion types are also considered, this is for the case + // when we need to return timestamp to user. If a range tombstone has a + // higher seqno than point tombstone, its timestamp should be returned. + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeValue: + case kTypeBlobIndex: + case kTypeWideColumnEntity: + assert(state_ == kNotFound || state_ == kMerge); + if (type == kTypeBlobIndex) { + if (is_blob_index_ == nullptr) { + // Blob value not supported. Stop. + state_ = kUnexpectedBlobIndex; + return false; + } + } + + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } + + if (kNotFound == state_) { + state_ = kFound; + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + Slice value_to_use = value; + + if (type == kTypeWideColumnEntity) { + Slice value_copy = value; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_to_use) + .ok()) { + state_ = kCorrupt; + return false; + } + } + + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value_to_use, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); + // Otherwise copy the value + pinnable_val_->PinSelf(value_to_use); + } + } else if (columns_ != nullptr) { + if (type == kTypeWideColumnEntity) { + if (!columns_->SetWideColumnValue(value, value_pinner).ok()) { + state_ = kCorrupt; + return false; + } + } else { + columns_->SetPlainValue(value, value_pinner); + } + } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + if (type == kTypeBlobIndex) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else if (type == kTypeWideColumnEntity) { + Slice value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); + } else { + assert(type == kTypeValue); + push_operand(value, value_pinner); + } + } + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + if (type == kTypeBlobIndex) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(blob_value, nullptr); + } + } else if (type == kTypeWideColumnEntity) { + state_ = kFound; + + if (do_merge_) { + MergeWithEntity(value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + Slice value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = 
kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); + } + } else { + assert(type == kTypeValue); + + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } + } + } + return false; + + case kTypeDeletion: + case kTypeDeletionWithTimestamp: + case kTypeSingleDeletion: + case kTypeRangeDeletion: + // TODO(noetzli): Verify correctness once merge of single-deletes + // is supported + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (do_merge_) { + Merge(nullptr); + } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + // value_pinner is not set from plain_table_reader.cc for example. + push_operand(value, value_pinner); + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { + state_ = kFound; + Merge(nullptr); + return false; + } + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +void GetContext::Merge(const Slice* value) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + std::string result; + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), &result, + logger_, statistics_, clock_, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + + if (LIKELY(pinnable_val_ != nullptr)) { + *(pinnable_val_->GetSelf()) = std::move(result); + pinnable_val_->PinSelf(); + return; + } + + assert(columns_); + columns_->SetPlainValue(result); +} + +void GetContext::MergeWithEntity(Slice entity) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + if (LIKELY(pinnable_val_ != nullptr)) { + Slice value_of_default; + + { + const Status s = WideColumnSerialization::GetValueOfDefaultColumn( + entity, value_of_default); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + { + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value_of_default, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), logger_, + statistics_, clock_, /* result_operand */ nullptr, + /* update_num_ops_stats */ true); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + pinnable_val_->PinSelf(); + return; + } + + std::string result; + + { + const Status s = MergeHelper::TimedFullMergeWithEntity( + merge_operator_, user_key_, entity, merge_context_->GetOperands(), + &result, logger_, statistics_, clock_, /* update_num_ops_stats */ true); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + { + assert(columns_); + const Status s = columns_->SetWideColumnValue(result); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } +} + +bool GetContext::GetBlobValue(const Slice& blob_index, + PinnableSlice* blob_value) { + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + Status status = blob_fetcher_->FetchBlob( + user_key_, blob_index, prefetch_buffer, blob_value, bytes_read); + if (!status.ok()) { + if (status.IsIncomplete()) { + // FIXME: 
this code is not covered by unit tests + MarkKeyMayExist(); + return false; + } + state_ = kCorrupt; + return false; + } + *is_blob_index_ = false; + return true; +} + +void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + // TODO(yanqin) preserve timestamps information in merge_context + if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } +} + +void replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, Cleanable* value_pinner) { +#ifndef ROCKSDB_LITE + Slice s = replay_log; + while (s.size()) { + auto type = static_cast<ValueType>(*s.data()); + s.remove_prefix(1); + Slice value; + bool ret = GetLengthPrefixedSlice(&s, &value); + assert(ret); + (void)ret; + + bool dont_care __attribute__((__unused__)); + // Since SequenceNumber is not stored and unknown, we will use + // kMaxSequenceNumber. + get_context->SaveValue( + ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, + &dont_care, value_pinner); + } +#else // ROCKSDB_LITE + (void)replay_log; + (void)user_key; + (void)get_context; + (void)value_pinner; + assert(false); +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h new file mode 100644 index 000000000..dcc7ab8d6 --- /dev/null +++ b/src/rocksdb/table/get_context.h @@ -0,0 +1,231 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <string> + +#include "db/read_callback.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { +class BlobFetcher; +class Comparator; +class Logger; +class MergeContext; +class MergeOperator; +class PinnableWideColumns; +class PinnedIteratorsManager; +class Statistics; +class SystemClock; +struct ParsedInternalKey; + +// Data structure for accumulating statistics during a point lookup. At the +// end of the point lookup, the corresponding ticker stats are updated. This +// avoids the overhead of frequent ticker stats updates +struct GetContextStats { + uint64_t num_cache_hit = 0; + uint64_t num_cache_index_hit = 0; + uint64_t num_cache_data_hit = 0; + uint64_t num_cache_filter_hit = 0; + uint64_t num_cache_compression_dict_hit = 0; + uint64_t num_cache_index_miss = 0; + uint64_t num_cache_filter_miss = 0; + uint64_t num_cache_data_miss = 0; + uint64_t num_cache_compression_dict_miss = 0; + uint64_t num_cache_bytes_read = 0; + uint64_t num_cache_miss = 0; + uint64_t num_cache_add = 0; + uint64_t num_cache_add_redundant = 0; + uint64_t num_cache_bytes_write = 0; + uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_add_redundant = 0; + uint64_t num_cache_index_bytes_insert = 0; + uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_add_redundant = 0; + uint64_t num_cache_data_bytes_insert = 0; + uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_add_redundant = 0; + uint64_t num_cache_filter_bytes_insert = 0; + uint64_t num_cache_compression_dict_add = 0; + uint64_t num_cache_compression_dict_add_redundant = 0; + uint64_t num_cache_compression_dict_bytes_insert = 0; + // MultiGet stats. 
+  uint64_t num_filter_read = 0;
+  uint64_t num_index_read = 0;
+  uint64_t num_sst_read = 0;
+};
+
+// A class to hold context about a point lookup, such as pointer to value
+// slice, key, merge context etc, as well as the current state of the
+// lookup. Any user using GetContext to track the lookup result must call
+// SaveValue() whenever the internal key is found. This can happen
+// repeatedly in case of merge operands. In case the key may exist with
+// high probability, but IO is required to confirm and the user doesn't allow
+// it, MarkKeyMayExist() must be called instead of SaveValue().
+class GetContext {
+ public:
+  // Current state of the point lookup. All except kNotFound and kMerge are
+  // terminal states.
+  enum GetState {
+    kNotFound,
+    kFound,
+    kDeleted,
+    kCorrupt,
+    kMerge,  // saver contains the current merge result (the operands)
+    kUnexpectedBlobIndex,
+  };
+  GetContextStats get_context_stats_;
+
+  // Constructor
+  // @param value Holds the value corresponding to user_key. If it's nullptr
+  //              then return all merge operands corresponding to user_key
+  //              via merge_context
+  // @param value_found If non-nullptr, set to false if key may be present
+  //                    but we can't be certain because we cannot do IO
+  // @param max_covering_tombstone_seq Pointer to highest sequence number of
+  //                    range deletion covering the key. When an internal key
+  //                    is found with smaller sequence number, the lookup
+  //                    terminates
+  // @param seq If non-nullptr, the sequence number of the found key will be
+  //            saved here
+  // @param callback Pointer to ReadCallback to perform additional checks
+  //                 for visibility of a key
+  // @param is_blob_index If non-nullptr, will be used to indicate if a found
+  //                      key is of type blob index
+  // @param do_merge True if the value associated with user_key has to be
+  // returned and false if all the merge operands associated with user_key
+  // have to be returned. If do_merge=false then all the merge operands are
+  // stored in merge_context and they are never merged. The value pointer is
+  // untouched.
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, bool* value_found,
+             MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, std::string* timestamp,
+             bool* value_found, MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+
+  GetContext() = delete;
+
+  // This can be called to indicate that a key may be present, but cannot be
+  // confirmed due to IO not being allowed
+  void MarkKeyMayExist();
+
+  // Records this key, value, and any meta-data (such as sequence number and
+  // state) into this GetContext.
+ // + // If the parsed_key matches the user key that we are looking for, sets + // matched to true. + // + // Returns True if more keys need to be read (due to merges) or + // False if the complete value has been found. + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, + bool* matched, Cleanable* value_pinner = nullptr); + + // Simplified version of the previous function. Should only be used when we + // know that the operation is a Put. + void SaveValue(const Slice& value, SequenceNumber seq); + + GetState State() const { return state_; } + + SequenceNumber* max_covering_tombstone_seq() { + return max_covering_tombstone_seq_; + } + + bool NeedTimestamp() { return timestamp_ != nullptr; } + + void SetTimestampFromRangeTombstone(const Slice& timestamp) { + assert(timestamp_); + timestamp_->assign(timestamp.data(), timestamp.size()); + ts_from_rangetombstone_ = true; + } + + PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; } + + // If a non-null string is passed, all the SaveValue calls will be + // logged into the string. The operations can then be replayed on + // another GetContext with replayGetContextLog. + void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; } + + // Do we need to fetch the SequenceNumber for this key? + bool NeedToReadSequence() const { return (seq_ != nullptr); } + + bool sample() const { return sample_; } + + bool CheckCallback(SequenceNumber seq) { + if (callback_) { + return callback_->IsVisible(seq); + } + return true; + } + + void ReportCounters(); + + bool has_callback() const { return callback_ != nullptr; } + + uint64_t get_tracing_get_id() const { return tracing_get_id_; } + + void push_operand(const Slice& value, Cleanable* value_pinner); + + private: + void Merge(const Slice* value); + void MergeWithEntity(Slice entity); + bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + PinnableSlice* pinnable_val_; + PinnableWideColumns* columns_; + std::string* timestamp_; + bool ts_from_rangetombstone_{false}; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; + SequenceNumber* max_covering_tombstone_seq_; + SystemClock* clock_; + // If a key is found, seq_ will be set to the SequenceNumber of most recent + // write to the key or kMaxSequenceNumber if unknown + SequenceNumber* seq_; + std::string* replay_log_; + // Used to temporarily pin blocks when state_ == GetContext::kMerge + PinnedIteratorsManager* pinned_iters_mgr_; + ReadCallback* callback_; + bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; + bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; +}; + +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). 
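+//
+// Illustrative usage (a sketch, not part of the original header; the two
+// GetContext objects and `user_key` are assumed to be set up by the caller):
+//   std::string log;
+//   source_context.SetReplayLog(&log);
+//   /* ... run the first lookup so SaveValue() records into `log` ... */
+//   source_context.SetReplayLog(nullptr);
+//   replayGetContextLog(Slice(log), user_key, &target_context);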
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, + Cleanable* value_pinner = nullptr); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/internal_iterator.h b/src/rocksdb/table/internal_iterator.h new file mode 100644 index 000000000..945dec806 --- /dev/null +++ b/src/rocksdb/table/internal_iterator.h @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include <string> + +#include "db/dbformat.h" +#include "file/readahead_file_info.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/status.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class PinnedIteratorsManager; + +enum class IterBoundCheck : char { + kUnknown = 0, + kOutOfBound, + kInbound, +}; + +struct IterateResult { + Slice key; + IterBoundCheck bound_check_result = IterBoundCheck::kUnknown; + // If false, PrepareValue() needs to be called before value(). + bool value_prepared = true; +}; + +template <class TValue> +class InternalIteratorBase : public Cleanable { + public: + InternalIteratorBase() {} + + // No copying allowed + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; + + virtual ~InternalIteratorBase() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. + // 'target' contains user timestamp if timestamp is enabled. + virtual void Seek(const Slice& target) = 0; + + // Position at the first key in the source that at or before target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + virtual void SeekForPrev(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when UpperBoundCheckResult() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. 
If an implementation has non-trivial UpperBoundCheckResult(),
+      // it should also override NextAndGetResult().
+      result->bound_check_result = IterBoundCheck::kUnknown;
+      result->value_prepared = false;
+      assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound);
+    }
+    return is_valid;
+  }
+
+  // Moves to the previous entry in the source. After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return user key for the current entry.
+  // REQUIRES: Valid()
+  virtual Slice user_key() const { return ExtractUserKey(key()); }
+
+  // Return the value for the current entry. The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  // REQUIRES: PrepareValue() has been called if needed (see PrepareValue()).
+  virtual TValue value() const = 0;
+
+  // If an error has occurred, return it. Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may
+  // load key but not value (to avoid the IO cost of reading the value from disk
+  // if it won't be needed). This method loads the value in such a situation.
+  //
+  // Needs to be called before value() at least once after each iterator
+  // movement (except if IterateResult::value_prepared = true), for iterators
+  // created with allow_unprepared_value = true.
+  //
+  // Returns false if an error occurred; in this case Valid() is also changed
+  // to false, and status() is changed to non-ok.
+  // REQUIRES: Valid()
+  virtual bool PrepareValue() { return true; }
+
+  // Keys returned from this iterator can be smaller than iterate_lower_bound.
+  virtual bool MayBeOutOfLowerBound() { return true; }
+
+  // If the iterator has checked the key against iterate_upper_bound, returns
+  // the result here. The function can be used by the user of the iterator to
+  // skip their own checks. If Valid() = true, IterBoundCheck::kUnknown is
+  // always a valid value. If Valid() = false, IterBoundCheck::kOutOfBound
+  // indicates that the iterator is filtered out by upper bound checks.
+  virtual IterBoundCheck UpperBoundCheckResult() {
+    return IterBoundCheck::kUnknown;
+  }
+
+  // Pass the PinnedIteratorsManager to the Iterator, most Iterators don't
+  // communicate with PinnedIteratorsManager so default implementation is no-op
+  // but for Iterators that need to communicate with PinnedIteratorsManager
+  // they will implement this function and use the passed pointer to communicate
+  // with PinnedIteratorsManager.
+  virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) {
+  }
+
+  // If true, this means that the Slice returned by key() is valid as long as
+  // PinnedIteratorsManager::ReleasePinnedData is not called and the
+  // Iterator is not deleted.
+  //
+  // IsKeyPinned() is guaranteed to always return true if
+  // - Iterator is created with ReadOptions::pin_data = true
+  // - DB tables were created with BlockBasedTableOptions::use_delta_encoding
+  // set to false.
+ virtual bool IsKeyPinned() const { return false; } + + // If true, this means that the Slice returned by value() is valid as long as + // PinnedIteratorsManager::ReleasePinnedData is not called and the + // Iterator is not deleted. + // REQUIRES: Same as for value(). + virtual bool IsValuePinned() const { return false; } + + virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { + return Status::NotSupported(""); + } + + // When iterator moves from one file to another file at same level, new file's + // readahead state (details of last block read) is updated with previous + // file's readahead state. This way internal readahead_size of Prefetch Buffer + // doesn't start from scratch and can fall back to 8KB with no prefetch if + // reads are not sequential. + // + // Default implementation is no-op and its implemented by iterators. + virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + + // Default implementation is no-op and its implemented by iterators. + virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + + // When used under merging iterator, LevelIterator treats file boundaries + // as sentinel keys to prevent it from moving to next SST file before range + // tombstones in the current SST file are no longer needed. This method makes + // it cheap to check if the current key is a sentinel key. This should only be + // used by MergingIterator and LevelIterator for now. + virtual bool IsDeleteRangeSentinelKey() const { return false; } + + protected: + void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && cmp->Compare(target, key()) < 0) { + Prev(); + } + } + + bool is_mutable_; +}; + +using InternalIterator = InternalIteratorBase<Slice>; + +// Return an empty iterator (yields nothing). +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(); + +// Return an empty iterator with the specified status. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status, Arena* arena); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/iter_heap.h b/src/rocksdb/table/iter_heap.h new file mode 100644 index 000000000..6ad94be9b --- /dev/null +++ b/src/rocksdb/table/iter_heap.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include "db/dbformat.h" +#include "table/iterator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +// When used with std::priority_queue, this comparison functor puts the +// iterator with the max/largest key on top. +class MaxIteratorComparator { + public: + MaxIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { + return comparator_->Compare(a->key(), b->key()) < 0; + } + + private: + const InternalKeyComparator* comparator_; +}; + +// When used with std::priority_queue, this comparison functor puts the +// iterator with the min/smallest key on top. 
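+//
+// Illustrative usage (a sketch, not from the original file; `icmp` is an
+// assumed InternalKeyComparator and the pushed wrappers are assumed valid):
+//   std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
+//                       MinIteratorComparator>
+//       min_heap{MinIteratorComparator(&icmp)};
+//   // After pushing wrappers, min_heap.top() is the one whose current key
+//   // compares smallest under icmp.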
+class MinIteratorComparator { + public: + MinIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { + return comparator_->Compare(a->key(), b->key()) > 0; + } + + private: + const InternalKeyComparator* comparator_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/iterator.cc b/src/rocksdb/table/iterator.cc new file mode 100644 index 000000000..14e280a07 --- /dev/null +++ b/src/rocksdb/table/iterator.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/iterator.h" + +#include "memory/arena.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +Status Iterator::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.is-key-pinned") { + *prop = "0"; + return Status::OK(); + } + return Status::InvalidArgument("Unidentified property."); +} + +namespace { +class EmptyIterator : public Iterator { + public: + explicit EmptyIterator(const Status& s) : status_(s) {} + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } + Slice key() const override { + assert(false); + return Slice(); + } + Slice value() const override { + assert(false); + return Slice(); + } + Status status() const override { return status_; } + + private: + Status status_; +}; + +template <class TValue = Slice> +class EmptyInternalIterator : public InternalIteratorBase<TValue> { + public: + explicit EmptyInternalIterator(const Status& s) : status_(s) {} + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } + Slice key() const override { + assert(false); + return Slice(); + } + TValue value() const override { + assert(false); + return TValue(); + } + Status status() const override { return status_; } + + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); } + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +template <class TValue> +InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator<TValue>(status); +} +template InternalIteratorBase<IndexValue>* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase<Slice>* NewErrorInternalIterator( + const Status& status); + +template <class TValue> +InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status, + Arena* arena) { + if 
(arena == nullptr) { + return NewErrorInternalIterator<TValue>(status); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>)); + return new (mem) EmptyInternalIterator<TValue>(status); + } +} +template InternalIteratorBase<IndexValue>* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase<Slice>* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template <class TValue> +InternalIteratorBase<TValue>* NewEmptyInternalIterator() { + return new EmptyInternalIterator<TValue>(Status::OK()); +} +template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator(); +template InternalIteratorBase<Slice>* NewEmptyInternalIterator(); + +template <class TValue> +InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) { + if (arena == nullptr) { + return NewEmptyInternalIterator<TValue>(); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>)); + return new (mem) EmptyInternalIterator<TValue>(Status::OK()); + } +} +template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/iterator_wrapper.h b/src/rocksdb/table/iterator_wrapper.h new file mode 100644 index 000000000..17abef4ac --- /dev/null +++ b/src/rocksdb/table/iterator_wrapper.h @@ -0,0 +1,190 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <set> + +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// A internal wrapper class with an interface similar to Iterator that caches +// the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +template <class TValue = Slice> +class IteratorWrapperBase { + public: + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter) + : iter_(nullptr) { + Set(_iter); + } + ~IteratorWrapperBase() {} + InternalIteratorBase<TValue>* iter() const { return iter_; } + + // Set the underlying Iterator to _iter and return + // previous underlying Iterator. 
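+  // Illustrative caller pattern (a sketch, not from the original header):
+  //   InternalIteratorBase<TValue>* old = wrapper.Set(new_iter);
+  //   wrapper.UpdateReadaheadState(old);  // optional state hand-off
+  //   delete old;  // caller-owned; skip if arena-allocated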
+ InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) { + InternalIteratorBase<TValue>* old_iter = iter_; + + iter_ = _iter; + if (iter_ == nullptr) { + valid_ = false; + } else { + Update(); + } + return old_iter; + } + + void DeleteIter(bool is_arena_mode) { + if (iter_) { + if (!is_arena_mode) { + delete iter_; + } else { + iter_->~InternalIteratorBase<TValue>(); + } + } + } + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { + assert(Valid()); + return result_.key; + } + TValue value() const { + assert(Valid()); + return iter_->value(); + } + // Methods below require iter() != nullptr + Status status() const { + assert(iter_); + return iter_->status(); + } + bool PrepareValue() { + assert(Valid()); + if (result_.value_prepared) { + return true; + } + if (iter_->PrepareValue()) { + result_.value_prepared = true; + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + void Next() { + assert(iter_); + valid_ = iter_->NextAndGetResult(&result_); + assert(!valid_ || iter_->status().ok()); + } + bool NextAndGetResult(IterateResult* result) { + assert(iter_); + valid_ = iter_->NextAndGetResult(&result_); + *result = result_; + assert(!valid_ || iter_->status().ok()); + return valid_; + } + void Prev() { + assert(iter_); + iter_->Prev(); + Update(); + } + void Seek(const Slice& k) { + assert(iter_); + iter_->Seek(k); + Update(); + } + void SeekForPrev(const Slice& k) { + assert(iter_); + iter_->SeekForPrev(k); + Update(); + } + void SeekToFirst() { + assert(iter_); + iter_->SeekToFirst(); + Update(); + } + void SeekToLast() { + assert(iter_); + iter_->SeekToLast(); + Update(); + } + + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() { + assert(Valid()); + return result_.bound_check_result; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { + assert(iter_); + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + bool IsKeyPinned() const { + assert(Valid()); + return iter_->IsKeyPinned(); + } + bool IsValuePinned() const { + assert(Valid()); + return iter_->IsValuePinned(); + } + + bool IsValuePrepared() const { return result_.value_prepared; } + + Slice user_key() const { + assert(Valid()); + return iter_->user_key(); + } + + void UpdateReadaheadState(InternalIteratorBase<TValue>* old_iter) { + if (old_iter && iter_) { + ReadaheadFileInfo readahead_file_info; + old_iter->GetReadaheadState(&readahead_file_info); + iter_->SetReadaheadState(&readahead_file_info); + } + } + + bool IsDeleteRangeSentinelKey() const { + return iter_->IsDeleteRangeSentinelKey(); + } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.bound_check_result = IterBoundCheck::kUnknown; + result_.value_prepared = false; + } + } + + InternalIteratorBase<TValue>* iter_; + IterateResult result_; + bool valid_; +}; + +using IteratorWrapper = IteratorWrapperBase<Slice>; + +class Arena; +// Return an empty iterator (yields nothing) allocated from arena. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc new file mode 100644 index 000000000..71dc798e5 --- /dev/null +++ b/src/rocksdb/table/merger_test.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <string> +#include <vector> + +#include "table/merging_iterator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +class MergerTest : public testing::Test { + public: + MergerTest() + : icomp_(BytewiseComparator()), + rnd_(3), + merging_iterator_(nullptr), + single_iterator_(nullptr) {} + ~MergerTest() override = default; + std::vector<std::string> GenerateStrings(size_t len, int string_len) { + std::vector<std::string> ret; + + for (size_t i = 0; i < len; ++i) { + InternalKey ik(rnd_.HumanReadableString(string_len), 0, + ValueType::kTypeValue); + ret.push_back(ik.Encode().ToString(false)); + } + return ret; + } + + void AssertEquivalence() { + auto a = merging_iterator_.get(); + auto b = single_iterator_.get(); + if (!a->Valid()) { + ASSERT_TRUE(!b->Valid()); + } else { + ASSERT_TRUE(b->Valid()); + ASSERT_EQ(b->key().ToString(), a->key().ToString()); + ASSERT_EQ(b->value().ToString(), a->value().ToString()); + } + } + + void SeekToRandom() { + InternalKey ik(rnd_.HumanReadableString(5), 0, ValueType::kTypeValue); + Seek(ik.Encode().ToString(false)); + } + + void Seek(std::string target) { + merging_iterator_->Seek(target); + single_iterator_->Seek(target); + } + + void SeekToFirst() { + merging_iterator_->SeekToFirst(); + single_iterator_->SeekToFirst(); + } + + void SeekToLast() { + merging_iterator_->SeekToLast(); + single_iterator_->SeekToLast(); + } + + void Next(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Next(); + single_iterator_->Next(); + } + AssertEquivalence(); + } + + void Prev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Prev(); + single_iterator_->Prev(); + } + AssertEquivalence(); + } + + void NextAndPrev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + if (rnd_.OneIn(2)) { + merging_iterator_->Prev(); + single_iterator_->Prev(); + } else { + merging_iterator_->Next(); + single_iterator_->Next(); + } + } + AssertEquivalence(); + } + + void Generate(size_t num_iterators, size_t strings_per_iterator, + int letters_per_string) { + std::vector<InternalIterator*> small_iterators; + for (size_t i = 0; i < num_iterators; ++i) { + auto strings = GenerateStrings(strings_per_iterator, letters_per_string); + small_iterators.push_back(new VectorIterator(strings, strings, &icomp_)); + all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); + } + + merging_iterator_.reset( + NewMergingIterator(&icomp_, &small_iterators[0], + static_cast<int>(small_iterators.size()))); + single_iterator_.reset(new VectorIterator(all_keys_, all_keys_, &icomp_)); + } + + InternalKeyComparator icomp_; + Random rnd_; + std::unique_ptr<InternalIterator> merging_iterator_; + std::unique_ptr<InternalIterator> single_iterator_; + std::vector<std::string> all_keys_; +}; + +TEST_F(MergerTest, SeekToRandomNextTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToRandomNextSmallStringsTest) { + Generate(1000, 50, 2); + for (int i = 0; i < 10; ++i) { + 
SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToRandomPrevTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Prev(50000); + } +} + +TEST_F(MergerTest, SeekToRandomRandomTest) { + Generate(200, 50, 50); + for (int i = 0; i < 3; ++i) { + SeekToRandom(); + AssertEquivalence(); + NextAndPrev(5000); + } +} + +TEST_F(MergerTest, SeekToFirstTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToFirst(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToLastTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToLast(); + AssertEquivalence(); + Prev(50000); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/merging_iterator.cc b/src/rocksdb/table/merging_iterator.cc new file mode 100644 index 000000000..beb35ea9a --- /dev/null +++ b/src/rocksdb/table/merging_iterator.cc @@ -0,0 +1,1403 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merging_iterator.h" + +#include "db/arena_wrapped_db_iter.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "memory/arena.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" +#include "table/iter_heap.h" +#include "table/iterator_wrapper.h" +#include "test_util/sync_point.h" +#include "util/autovector.h" +#include "util/heap.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { +// For merging iterator to process range tombstones, we treat the start and end +// keys of a range tombstone as point keys and put them into the minHeap/maxHeap +// used in merging iterator. Take minHeap for example, we are able to keep track +// of currently "active" range tombstones (the ones whose start keys are popped +// but end keys are still in the heap) in `active_`. This `active_` set of range +// tombstones is then used to quickly determine whether the point key at heap +// top is deleted (by heap property, the point key at heap top must be within +// internal key range of active range tombstones). +// +// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap: +// point key and the start and end keys of a range tombstone. +struct HeapItem { + HeapItem() = default; + + enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END }; + IteratorWrapper iter; + size_t level = 0; + std::string pinned_key; + // Will be overwritten before use, initialize here so compiler does not + // complain. + Type type = ITERATOR; + + explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter) + : level(_level), type(Type::ITERATOR) { + iter.Set(_iter); + } + + void SetTombstoneKey(ParsedInternalKey&& pik) { + pinned_key.clear(); + // Range tombstone end key is exclusive. 
If a point internal key has the + // same user key and sequence number as the start or end key of a range + // tombstone, the order will be start < end key < internal key with the + // following op_type change. This is helpful to ensure keys popped from + // heap are in expected order since range tombstone start/end keys will + // be distinct from point internal keys. Strictly speaking, this is only + // needed for tombstone end points that are truncated in + // TruncatedRangeDelIterator since untruncated tombstone end points always + // have kMaxSequenceNumber and kTypeRangeDeletion (see + // TruncatedRangeDelIterator::start_key()/end_key()). + ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid); + AppendInternalKey(&pinned_key, p); + } + + Slice key() const { + if (type == Type::ITERATOR) { + return iter.key(); + } + return pinned_key; + } + + bool IsDeleteRangeSentinelKey() const { + if (type == Type::ITERATOR) { + return iter.IsDeleteRangeSentinelKey(); + } + return false; + } +}; + +class MinHeapItemComparator { + public: + MinHeapItemComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + bool operator()(HeapItem* a, HeapItem* b) const { + return comparator_->Compare(a->key(), b->key()) > 0; + } + + private: + const InternalKeyComparator* comparator_; +}; + +class MaxHeapItemComparator { + public: + MaxHeapItemComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + bool operator()(HeapItem* a, HeapItem* b) const { + return comparator_->Compare(a->key(), b->key()) < 0; + } + + private: + const InternalKeyComparator* comparator_; +}; +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { +using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>; +using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>; +} // namespace + +class MergingIterator : public InternalIterator { + public: + MergingIterator(const InternalKeyComparator* comparator, + InternalIterator** children, int n, bool is_arena_mode, + bool prefix_seek_mode, + const Slice* iterate_upper_bound = nullptr) + : is_arena_mode_(is_arena_mode), + prefix_seek_mode_(prefix_seek_mode), + direction_(kForward), + comparator_(comparator), + current_(nullptr), + minHeap_(comparator_), + pinned_iters_mgr_(nullptr), + iterate_upper_bound_(iterate_upper_bound) { + children_.resize(n); + for (int i = 0; i < n; i++) { + children_[i].level = i; + children_[i].iter.Set(children[i]); + } + } + + void considerStatus(Status s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + + virtual void AddIterator(InternalIterator* iter) { + children_.emplace_back(children_.size(), iter); + if (pinned_iters_mgr_) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + // Invalidate to ensure `Seek*()` is called to construct the heaps before + // use. + current_ = nullptr; + } + + // Merging iterator can optionally process range tombstones: if a key is + // covered by a range tombstone, the merging iterator will not output it but + // skip it. + // + // Add the next range tombstone iterator to this merging iterator. + // There must be either no range tombstone iterator, or same number of + // range tombstone iterators as point iterators after all range tombstone + // iters are added. The i-th added range tombstone iterator and the i-th point + // iterator must point to the same sorted run. + // Merging iterator takes ownership of the range tombstone iterator and + // is responsible for freeing it. 
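// A standalone sketch of the mechanism the comment block above describes:
// one entry per child iterator sits in a min-heap keyed by that child's
// current key, and the merge repeatedly pops the smallest entry. It uses only
// the C++ standard library (std::priority_queue in place of RocksDB's
// BinaryHeap); every name below is illustrative, not a RocksDB API.

#include <cstddef>
#include <queue>
#include <string>
#include <vector>

namespace merge_sketch {

// One heap entry per sorted run: which run it came from and a cursor into it.
struct Entry {
  size_t level;
  size_t pos;
};

// Merge several individually sorted runs into one sorted output.
// std::priority_queue is a max-heap, so the comparator returns true when `a`
// must sit below `b`, i.e. when a's key is greater -- the same inversion
// MinHeapItemComparator above performs with Compare(a, b) > 0.
std::vector<std::string> KWayMerge(
    const std::vector<std::vector<std::string>>& runs) {
  auto key_of = [&](const Entry& e) -> const std::string& {
    return runs[e.level][e.pos];
  };
  auto greater = [&](const Entry& a, const Entry& b) {
    return key_of(a) > key_of(b);
  };
  std::priority_queue<Entry, std::vector<Entry>, decltype(greater)> min_heap(
      greater);
  for (size_t level = 0; level < runs.size(); ++level) {
    if (!runs[level].empty()) {
      min_heap.push(Entry{level, 0});
    }
  }
  std::vector<std::string> out;
  while (!min_heap.empty()) {
    Entry top = min_heap.top();
    min_heap.pop();
    out.push_back(key_of(top));
    if (top.pos + 1 < runs[top.level].size()) {
      min_heap.push(Entry{top.level, top.pos + 1});  // advance that run only
    }
  }
  return out;
}

}  // namespace merge_sketch

// Like the merging iterator itself, the sketch does no duplicate suppression:
// a key present in K runs is emitted K times.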
Note that during Iterator::Refresh() + // and when a level iterator moves to a different SST file, the range + // tombstone iterator could be updated. In that case, the merging iterator + // is only responsible to freeing the new range tombstone iterator + // that it has pointers to in range_tombstone_iters_. + void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) { + range_tombstone_iters_.emplace_back(iter); + } + + // Called by MergingIteratorBuilder when all point iterators and range + // tombstone iterators are added. Initializes HeapItems for range tombstone + // iterators so that no further allocation is needed for HeapItem. + void Finish() { + if (!range_tombstone_iters_.empty()) { + pinned_heap_item_.resize(range_tombstone_iters_.size()); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + pinned_heap_item_[i].level = i; + } + } + } + + ~MergingIterator() override { + for (auto child : range_tombstone_iters_) { + delete child; + } + + for (auto& child : children_) { + child.iter.DeleteIter(is_arena_mode_); + } + status_.PermitUncheckedError(); + } + + bool Valid() const override { return current_ != nullptr && status_.ok(); } + + Status status() const override { return status_; } + + // Add range_tombstone_iters_[level] into min heap. + // Updates active_ if the end key of a range tombstone is inserted. + // @param start_key specifies which end point of the range tombstone to add. + void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true, + bool replace_top = false) { + assert(!range_tombstone_iters_.empty() && + range_tombstone_iters_[level]->Valid()); + if (start_key) { + ParsedInternalKey pik = range_tombstone_iters_[level]->start_key(); + // iterate_upper_bound does not have timestamp + if (iterate_upper_bound_ && + comparator_->user_comparator()->CompareWithoutTimestamp( + pik.user_key, true /* a_has_ts */, *iterate_upper_bound_, + false /* b_has_ts */) >= 0) { + if (replace_top) { + // replace_top implies this range tombstone iterator is still in + // minHeap_ and at the top. + minHeap_.pop(); + } + return; + } + pinned_heap_item_[level].SetTombstoneKey(std::move(pik)); + pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; + assert(active_.count(level) == 0); + } else { + // allow end key to go over upper bound (if present) since start key is + // before upper bound and the range tombstone could still cover a + // range before upper bound. + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->end_key()); + pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; + active_.insert(level); + } + if (replace_top) { + minHeap_.replace_top(&pinned_heap_item_[level]); + } else { + minHeap_.push(&pinned_heap_item_[level]); + } + } + + // Add range_tombstone_iters_[level] into max heap. + // Updates active_ if the start key of a range tombstone is inserted. + // @param end_key specifies which end point of the range tombstone to add. 
+ void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true, + bool replace_top = false) { + assert(!range_tombstone_iters_.empty() && + range_tombstone_iters_[level]->Valid()); + if (end_key) { + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->end_key()); + pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; + assert(active_.count(level) == 0); + } else { + pinned_heap_item_[level].SetTombstoneKey( + range_tombstone_iters_[level]->start_key()); + pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; + active_.insert(level); + } + if (replace_top) { + maxHeap_->replace_top(&pinned_heap_item_[level]); + } else { + maxHeap_->push(&pinned_heap_item_[level]); + } + } + + // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START + // until minHeap_ is empty or the top of the minHeap_ is not of type + // DELETE_RANGE_START. Each such item means a range tombstone becomes active, + // so `active_` is updated accordingly. + void PopDeleteRangeStart() { + while (!minHeap_.empty() && + minHeap_.top()->type == HeapItem::DELETE_RANGE_START) { + TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr); + // insert end key of this range tombstone and updates active_ + InsertRangeTombstoneToMinHeap( + minHeap_.top()->level, false /* start_key */, true /* replace_top */); + } + } + + // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END + // until maxHeap_ is empty or the top of the maxHeap_ is not of type + // DELETE_RANGE_END. Each such item means a range tombstone becomes active, + // so `active_` is updated accordingly. + void PopDeleteRangeEnd() { + while (!maxHeap_->empty() && + maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) { + // insert start key of this range tombstone and updates active_ + InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */, + true /* replace_top */); + } + } + + void SeekToFirst() override { + ClearHeaps(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.SeekToFirst(); + AddToMinHeapOrCheckStatus(&child); + } + + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->SeekToFirst(); + if (range_tombstone_iters_[i]->Valid()) { + // It is possible to be invalid due to snapshots. + InsertRangeTombstoneToMinHeap(i); + } + } + } + FindNextVisibleKey(); + direction_ = kForward; + current_ = CurrentForward(); + } + + void SeekToLast() override { + ClearHeaps(); + InitMaxHeap(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.SeekToLast(); + AddToMaxHeapOrCheckStatus(&child); + } + + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->SeekToLast(); + if (range_tombstone_iters_[i]->Valid()) { + // It is possible to be invalid due to snapshots. + InsertRangeTombstoneToMaxHeap(i); + } + } + } + FindPrevVisibleKey(); + direction_ = kReverse; + current_ = CurrentReverse(); + } + + // Position this merging iterator at the first key >= target (internal key). + // If range tombstones are present, keys covered by range tombstones are + // skipped, and this merging iter points to the first non-range-deleted key >= + // target after Seek(). If !Valid() and status().ok() then end of the iterator + // is reached. + // + // Internally, this involves positioning all child iterators at the first key + // >= target. 
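// The Seek() documented above starts by positioning every child at its first
// key >= target and rebuilding the min-heap from the children that are still
// valid. A hedged sketch over plain sorted vectors, with std::lower_bound
// standing in for a child iterator's Seek() (names are illustrative only):

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

namespace seek_sketch {

struct Cursor {
  const std::vector<std::string>* run = nullptr;
  size_t pos = 0;
  bool Valid() const { return run != nullptr && pos < run->size(); }
  const std::string& key() const { return (*run)[pos]; }
};

// Position each run at its first key >= target and collect the cursors that
// remain valid; the real code then pushes each valid child into minHeap_ via
// AddToMinHeapOrCheckStatus().
std::vector<Cursor> SeekAll(const std::vector<std::vector<std::string>>& runs,
                            const std::string& target) {
  std::vector<Cursor> valid;
  for (const auto& run : runs) {
    Cursor c;
    c.run = &run;
    c.pos = static_cast<size_t>(
        std::lower_bound(run.begin(), run.end(), target) - run.begin());
    if (c.Valid()) {
      valid.push_back(c);
    }
  }
  return valid;
}

}  // namespace seek_sketch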
If range tombstones are present, we apply a similar + // optimization, cascading seek, as in Pebble + // (https://github.com/cockroachdb/pebble). Specifically, if there is a range + // tombstone [start, end) that covers the target user key at level L, then + // this range tombstone must cover the range [target key, end) in all levels > + // L. So for all levels > L, we can pretend the target key is `end`. This + // optimization is applied at each level and hence the name "cascading seek". + // After a round of (cascading) seeks, the top of the heap is checked to see + // if it is covered by a range tombstone (see FindNextVisibleKey() for more + // detail), and advanced if so. The process is repeated until a + // non-range-deleted key is at the top of the heap, or heap becomes empty. + // + // As mentioned in comments above HeapItem, to make the checking of whether + // top of the heap is covered by some range tombstone efficient, we treat each + // range deletion [start, end) as two point keys and insert them into the same + // min/maxHeap_ where point iterators are. The set `active_` tracks the levels + // that have active range tombstones. If level L is in `active_`, and the + // point key at top of the heap is from level >= L, then the point key is + // within the internal key range of the range tombstone that + // range_tombstone_iters_[L] currently points to. For correctness reasoning, + // one invariant that Seek() (and every other public APIs Seek*(), + // Next/Prev()) guarantees is as follows. After Seek(), suppose `k` is the + // current key of level L's point iterator. Then for each range tombstone + // iterator at level <= L, it is at or before the first range tombstone with + // end key > `k`. This ensures that when level L's point iterator reaches top + // of the heap, `active_` is calculated correctly (it contains the covering + // range tombstone's level if there is one), since no range tombstone iterator + // was skipped beyond that point iterator's current key during Seek(). + // Next()/Prev() maintains a stronger version of this invariant where all + // range tombstone iterators from level <= L are *at* the first range + // tombstone with end key > `k`. + void Seek(const Slice& target) override { + assert(range_tombstone_iters_.empty() || + range_tombstone_iters_.size() == children_.size()); + SeekImpl(target); + FindNextVisibleKey(); + + direction_ = kForward; + { + PERF_TIMER_GUARD(seek_min_heap_time); + current_ = CurrentForward(); + } + } + + void SeekForPrev(const Slice& target) override { + assert(range_tombstone_iters_.empty() || + range_tombstone_iters_.size() == children_.size()); + SeekForPrevImpl(target); + FindPrevVisibleKey(); + + direction_ = kReverse; + { + PERF_TIMER_GUARD(seek_max_heap_time); + current_ = CurrentReverse(); + } + } + + void Next() override { + assert(Valid()); + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current children since current_ is + // the smallest child and key() == current_->key(). + if (direction_ != kForward) { + // The loop advanced all non-current children to be > key() so current_ + // should still be strictly the smallest key. + SwitchToForward(); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentForward()); + // as the current points to the current record. move the iterator forward. 
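// Next(), shown below, advances only the child at the top of the heap and
// then restores heap order: sift the child back in if it is still valid, or
// drop it otherwise. A standalone sketch with std::pop_heap/std::push_heap
// standing in for BinaryHeap::replace_top()/pop(); names are illustrative,
// not RocksDB APIs.

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

namespace advance_sketch {

struct Cursor {
  const std::vector<std::string>* run;
  size_t pos;
  bool Valid() const { return pos < run->size(); }
  const std::string& key() const { return (*run)[pos]; }
};

// Comparator for the std heap algorithms: inverted so the smallest key sits
// at the front, matching the merging iterator's min-heap.
inline bool HeapLess(const Cursor& a, const Cursor& b) {
  return a.key() > b.key();
}

// Advance the front cursor by one key and restore the heap property. RocksDB's
// BinaryHeap::replace_top() achieves the same with a single sift-down instead
// of the pop_heap/push_heap pair used here.
void AdvanceTop(std::vector<Cursor>& heap) {
  std::pop_heap(heap.begin(), heap.end(), HeapLess);  // move front to back
  Cursor& cur = heap.back();
  ++cur.pos;
  if (cur.Valid()) {
    std::push_heap(heap.begin(), heap.end(), HeapLess);  // sift back in
  } else {
    heap.pop_back();  // child exhausted: remove it, like minHeap_.pop()
  }
}

}  // namespace advance_sketch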
+ current_->Next(); + if (current_->Valid()) { + // current is still valid after the Next() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + minHeap_.replace_top(minHeap_.top()); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); + minHeap_.pop(); + } + FindNextVisibleKey(); + current_ = CurrentForward(); + } + + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = current_->IsValuePrepared(); + } + return is_valid; + } + + void Prev() override { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current children since current_ is + // the largest child and key() == current_->key(). + if (direction_ != kReverse) { + // Otherwise, retreat the non-current children. We retreat current_ + // just after the if-block. + SwitchToBackward(); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentReverse()); + current_->Prev(); + if (current_->Valid()) { + // current is still valid after the Prev() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + maxHeap_->replace_top(maxHeap_->top()); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); + maxHeap_->pop(); + } + FindPrevVisibleKey(); + current_ = CurrentReverse(); + } + + Slice key() const override { + assert(Valid()); + return current_->key(); + } + + Slice value() const override { + assert(Valid()); + return current_->value(); + } + + bool PrepareValue() override { + assert(Valid()); + if (current_->PrepareValue()) { + return true; + } + + considerStatus(current_->status()); + assert(!status_.ok()); + return false; + } + + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from current child iterator. Potentially as long as one of child iterator + // report out of bound is not possible, we know current key is within bound. 
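// NextAndGetResult() above packs the post-advance state into one result
// struct so a caller pays a single virtual call per step instead of separate
// Next()/Valid()/key() calls. A minimal sketch of that batching pattern; the
// struct and method names are illustrative, not the RocksDB definitions.

#include <string>

namespace step_sketch {

// Everything a caller needs after one forward step, filled in one call.
struct StepResult {
  std::string key;
  bool value_prepared = false;
};

class StepIterator {
 public:
  virtual ~StepIterator() = default;
  virtual bool Valid() const = 0;
  virtual void Next() = 0;
  virtual std::string key() const = 0;

  // One virtual dispatch per step instead of Next() + Valid() + key().
  virtual bool NextAndGetResult(StepResult* result) {
    Next();
    const bool valid = Valid();
    if (valid) {
      result->key = key();
      result->value_prepared = false;  // value can still be prepared lazily
    }
    return valid;
  }
};

}  // namespace step_sketch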
+ + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return current_->UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + for (auto& child : children_) { + child.iter.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + bool IsKeyPinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsValuePinned(); + } + + private: + friend class MergeIteratorBuilder; + // Clears heaps for both directions, used when changing direction or seeking + void ClearHeaps(bool clear_active = true); + // Ensures that maxHeap_ is initialized when starting to go in the reverse + // direction + void InitMaxHeap(); + + // Advance this merging iterator until the current key (top of min heap) is + // not covered by any range tombstone or that there is no more keys (heap is + // empty). After this call, if Valid(), current_ points to the next key that + // is not covered by any range tombstone. + void FindNextVisibleKey(); + void FindPrevVisibleKey(); + + void SeekImpl(const Slice& target, size_t starting_level = 0, + bool range_tombstone_reseek = false); + + // Seek to fist key <= target key (internal key) for + // children_[starting_level:]. + void SeekForPrevImpl(const Slice& target, size_t starting_level = 0, + bool range_tombstone_reseek = false); + + bool is_arena_mode_; + bool prefix_seek_mode_; + // Which direction is the iterator moving? + enum Direction : uint8_t { kForward, kReverse }; + Direction direction_; + const InternalKeyComparator* comparator_; + // We could also use an autovector with a larger reserved size. + // HeapItem for all child point iterators. + std::vector<HeapItem> children_; + // HeapItem for range tombstone start and end keys. Each range tombstone + // iterator will have at most one side (start key or end key) in a heap + // at the same time, so this vector will be of size children_.size(); + // pinned_heap_item_[i] corresponds to the start key and end key HeapItem + // for range_tombstone_iters_[i]. + std::vector<HeapItem> pinned_heap_item_; + // range_tombstone_iters_[i] contains range tombstones in the sorted run that + // corresponds to children_[i]. range_tombstone_iters_.empty() means not + // handling range tombstones in merging iterator. range_tombstone_iters_[i] == + // nullptr means the sorted run of children_[i] does not have range + // tombstones. + std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_; + + // Levels (indices into range_tombstone_iters_/children_ ) that currently have + // "active" range tombstones. See comments above Seek() for meaning of + // "active". + std::set<size_t> active_; + + bool SkipNextDeleted(); + bool SkipPrevDeleted(); + + // Cached pointer to child iterator with the current key, or nullptr if no + // child iterators are valid. This is the top of minHeap_ or maxHeap_ + // depending on the direction. + IteratorWrapper* current_; + // If any of the children have non-ok status, this is one of them. + Status status_; + MergerMinIterHeap minHeap_; + + // Max heap is used for reverse iteration, which is way less common than + // forward. Lazily initialize it to save memory. 
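// As the comment above notes, the max-heap exists only for reverse iteration,
// so it is allocated lazily and forward-only readers never pay for it. A small
// sketch of the same lazy-initialization pattern (illustrative types only):

#include <functional>
#include <memory>
#include <queue>
#include <string>
#include <vector>

namespace lazy_heap_sketch {

class TwoWayMergerSketch {
 public:
  // Create the max-heap only when reverse iteration is first requested.
  void EnsureMaxHeap() {
    if (!max_heap_) {
      max_heap_ = std::make_unique<std::priority_queue<std::string>>();
    }
  }

 private:
  // Min-heap for the common forward direction, always present.
  std::priority_queue<std::string, std::vector<std::string>,
                      std::greater<std::string>>
      min_heap_;
  // Max-heap for reverse iteration, materialized lazily.
  std::unique_ptr<std::priority_queue<std::string>> max_heap_;
};

}  // namespace lazy_heap_sketch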
+ std::unique_ptr<MergerMaxIterHeap> maxHeap_; + PinnedIteratorsManager* pinned_iters_mgr_; + + // Used to bound range tombstones. For point keys, DBIter and SSTable iterator + // take care of boundary checking. + const Slice* iterate_upper_bound_; + + // In forward direction, process a child that is not in the min heap. + // If valid, add to the min heap. Otherwise, check status. + void AddToMinHeapOrCheckStatus(HeapItem*); + + // In backward direction, process a child that is not in the max heap. + // If valid, add to the min heap. Otherwise, check status. + void AddToMaxHeapOrCheckStatus(HeapItem*); + + void SwitchToForward(); + + // Switch the direction from forward to backward without changing the + // position. Iterator should still be valid. + void SwitchToBackward(); + + IteratorWrapper* CurrentForward() const { + assert(direction_ == kForward); + assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR); + return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr; + } + + IteratorWrapper* CurrentReverse() const { + assert(direction_ == kReverse); + assert(maxHeap_); + assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR); + return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr; + } +}; + +// Seek to fist key >= target key (internal key) for children_[starting_level:]. +// Cascading seek optimizations are applied if range tombstones are present (see +// comment above Seek() for more). +// +// @param range_tombstone_reseek Whether target is some range tombstone +// end, i.e., whether this SeekImpl() call is a part of a "cascading seek". This +// is used only for recoding relevant perf_context. +void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, + bool range_tombstone_reseek) { + // active range tombstones before `starting_level` remain active + ClearHeaps(false /* clear_active */); + ParsedInternalKey pik; + if (!range_tombstone_iters_.empty()) { + // pik is only used in InsertRangeTombstoneToMinHeap(). + ParseInternalKey(target, &pik, false).PermitUncheckedError(); + } + + // TODO: perhaps we could save some upheap cost by add all child iters first + // and then do a single heapify. + for (size_t level = 0; level < starting_level; ++level) { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[level]); + } + if (!range_tombstone_iters_.empty()) { + // Add range tombstones from levels < starting_level. We can insert from + // pinned_heap_item_ for the following reasons: + // - pinned_heap_item_[level] is in minHeap_ iff + // range_tombstone_iters[level]->Valid(). + // - If `level` is in active_, then range_tombstone_iters_[level]->Valid() + // and pinned_heap_item_[level] is of type RANGE_DELETION_END. + for (size_t level = 0; level < starting_level; ++level) { + if (range_tombstone_iters_[level] && + range_tombstone_iters_[level]->Valid()) { + // use an iterator on active_ if performance becomes an issue here + if (active_.count(level) > 0) { + assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END); + // if it was active, then start key must be within upper_bound, + // so we can add to minHeap_ directly. + minHeap_.push(&pinned_heap_item_[level]); + } else { + // this takes care of checking iterate_upper_bound, but with an extra + // key comparison if range_tombstone_iters_[level] was already out of + // bound. Consider using a new HeapItem type or some flag to remember + // boundary checking result. 
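// The boundary check discussed above, in isolation: a range tombstone whose
// start user key is already at or past iterate_upper_bound_ cannot cover
// anything this iterator may return, so it is not worth tracking. A
// simplified sketch using plain string comparison in place of the
// timestamp-aware user comparator (illustrative only):

#include <string>

namespace bound_sketch {

bool TombstoneWorthTracking(const std::string& tombstone_start_user_key,
                            const std::string* iterate_upper_bound) {
  if (iterate_upper_bound != nullptr &&
      tombstone_start_user_key >= *iterate_upper_bound) {
    return false;  // tombstone starts beyond the read range; skip it
  }
  return true;
}

}  // namespace bound_sketch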
+ InsertRangeTombstoneToMinHeap(level); + } + } else { + assert(!active_.count(level)); + } + } + // levels >= starting_level will be reseeked below, so clearing their active + // state here. + active_.erase(active_.lower_bound(starting_level), active_.end()); + } + + status_ = Status::OK(); + IterKey current_search_key; + current_search_key.SetInternalKey(target, false /* copy */); + // Seek target might change to some range tombstone end key, so + // we need to remember them for async requests. + // (level, target) pairs + autovector<std::pair<size_t, std::string>> prefetched_target; + for (auto level = starting_level; level < children_.size(); ++level) { + { + PERF_TIMER_GUARD(seek_child_seek_time); + children_[level].iter.Seek(current_search_key.GetInternalKey()); + } + + PERF_COUNTER_ADD(seek_child_seek_count, 1); + + if (!range_tombstone_iters_.empty()) { + if (range_tombstone_reseek) { + // This seek is to some range tombstone end key. + // Should only happen when there are range tombstones. + PERF_COUNTER_ADD(internal_range_del_reseek_count, 1); + } + if (children_[level].iter.status().IsTryAgain()) { + prefetched_target.emplace_back( + level, current_search_key.GetInternalKey().ToString()); + } + auto range_tombstone_iter = range_tombstone_iters_[level]; + if (range_tombstone_iter) { + range_tombstone_iter->Seek(current_search_key.GetUserKey()); + if (range_tombstone_iter->Valid()) { + // insert the range tombstone end that is closer to and >= + // current_search_key. Strictly speaking, since the Seek() call above + // is on user key, it is possible that range_tombstone_iter->end_key() + // < current_search_key. This can happen when range_tombstone_iter is + // truncated and range_tombstone_iter.largest_ has the same user key + // as current_search_key.GetUserKey() but with a larger sequence + // number than current_search_key. Correctness is not affected as this + // tombstone end key will be popped during FindNextVisibleKey(). + InsertRangeTombstoneToMinHeap( + level, comparator_->Compare(range_tombstone_iter->start_key(), + pik) > 0 /* start_key */); + // current_search_key < end_key guaranteed by the Seek() and Valid() + // calls above. Only interested in user key coverage since older + // sorted runs must have smaller sequence numbers than this range + // tombstone. + // + // TODO: range_tombstone_iter->Seek() finds the max covering + // sequence number, can make it cheaper by not looking for max. + if (comparator_->user_comparator()->Compare( + range_tombstone_iter->start_key().user_key, + current_search_key.GetUserKey()) <= 0) { + // Since range_tombstone_iter->Valid(), seqno should be valid, so + // there is no need to check it. + range_tombstone_reseek = true; + // Current target user key is covered by this range tombstone. + // All older sorted runs will seek to range tombstone end key. + // Note that for prefix seek case, it is possible that the prefix + // is not the same as the original target, it should not affect + // correctness. Besides, in most cases, range tombstone start and + // end key should have the same prefix? + // If range_tombstone_iter->end_key() is truncated to its largest_ + // boundary, the timestamp in user_key will not be max timestamp, + // but the timestamp of `range_tombstone_iter.largest_`. This should + // be fine here as current_search_key is used to Seek into lower + // levels. 
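// A hedged sketch of the cascading seek being performed here: each level
// seeks to the current search key, and whenever that level's tombstone covers
// the key, every older level seeks to the tombstone's end key instead.
// Sequence numbers and truncation are ignored, and an empty `end` means "no
// tombstone at this level"; all names are illustrative, not RocksDB APIs.

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

namespace cascade_sketch {

// At most one range tombstone per level for simplicity: [start, end) on user
// keys, end exclusive.
struct Tombstone {
  std::string start;
  std::string end;
};

// Returns the position each run ends up at after a cascading seek to `target`.
std::vector<size_t> CascadingSeek(
    const std::vector<std::vector<std::string>>& runs,
    const std::vector<Tombstone>& tombstones, std::string target) {
  std::vector<size_t> positions(runs.size());
  for (size_t level = 0; level < runs.size(); ++level) {
    // Seek this level to the current search key.
    positions[level] = static_cast<size_t>(
        std::lower_bound(runs[level].begin(), runs[level].end(), target) -
        runs[level].begin());
    const Tombstone& t = tombstones[level];
    if (!t.end.empty() && t.start <= target && target < t.end) {
      // Covered at this level: all older levels can pretend the target is the
      // tombstone's end key, since [target, end) is deleted for them.
      target = t.end;
    }
  }
  return positions;
}

}  // namespace cascade_sketch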
+ current_search_key.SetInternalKey( + range_tombstone_iter->end_key().user_key, kMaxSequenceNumber); + } + } + } + } + // child.iter.status() is set to Status::TryAgain indicating asynchronous + // request for retrieval of data blocks has been submitted. So it should + // return at this point and Seek should be called again to retrieve the + // requested block and add the child to min heap. + if (children_[level].iter.status().IsTryAgain()) { + continue; + } + { + // Strictly, we timed slightly more than min heap operation, + // but these operations are very cheap. + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[level]); + } + } + + if (range_tombstone_iters_.empty()) { + for (auto& child : children_) { + if (child.iter.status().IsTryAgain()) { + child.iter.Seek(target); + { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&child); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } + } else { + for (auto& prefetch : prefetched_target) { + // (level, target) pairs + children_[prefetch.first].iter.Seek(prefetch.second); + { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMinHeapOrCheckStatus(&children_[prefetch.first]); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } +} + +// Returns true iff the current key (min heap top) should not be returned +// to user (of the merging iterator). This can be because the current key +// is deleted by some range tombstone, the current key is some fake file +// boundary sentinel key, or the current key is an end point of a range +// tombstone. Advance the iterator at heap top if needed. Heap order is restored +// and `active_` is updated accordingly. +// See FindNextVisibleKey() for more detail on internal implementation +// of advancing child iters. +// +// REQUIRES: +// - min heap is currently not empty, and iter is in kForward direction. +// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current). +bool MergingIterator::SkipNextDeleted() { + // 3 types of keys: + // - point key + // - file boundary sentinel keys + // - range deletion end key + auto current = minHeap_.top(); + if (current->type == HeapItem::DELETE_RANGE_END) { + active_.erase(current->level); + assert(range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()); + range_tombstone_iters_[current->level]->Next(); + if (range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, + true /* replace_top */); + } else { + minHeap_.pop(); + } + return true /* current key deleted */; + } + if (current->iter.IsDeleteRangeSentinelKey()) { + // If the file boundary is defined by a range deletion, the range + // tombstone's end key must come before this sentinel key (see op_type in + // SetTombstoneKey()). + assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion || + active_.count(current->level) == 0); + // LevelIterator enters a new SST file + current->iter.Next(); + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + minHeap_.replace_top(current); + } else { + minHeap_.pop(); + } + // Remove last SST file's range tombstone end key if there is one. + // This means file boundary is before range tombstone end key, + // which could happen when a range tombstone and a user key + // straddle two SST files. Note that in TruncatedRangeDelIterator + // constructor, parsed_largest.sequence is decremented 1 in this case. 
+ if (!minHeap_.empty() && minHeap_.top()->level == current->level && + minHeap_.top()->type == HeapItem::DELETE_RANGE_END) { + minHeap_.pop(); + active_.erase(current->level); + } + if (range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMinHeap(current->level); + } + return true /* current key deleted */; + } + assert(current->type == HeapItem::ITERATOR); + // Point key case: check active_ for range tombstone coverage. + ParsedInternalKey pik; + ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); + if (!active_.empty()) { + auto i = *active_.begin(); + if (i < current->level) { + // range tombstone is from a newer level, definitely covers + assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + std::string target; + AppendInternalKey(&target, range_tombstone_iters_[i]->end_key()); + SeekImpl(target, current->level, true); + return true /* current key deleted */; + } else if (i == current->level) { + // range tombstone is from the same level as current, check sequence + // number. By `active_` we know current key is between start key and end + // key. + assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { + // covered by range tombstone + current->iter.Next(); + if (current->iter.Valid()) { + minHeap_.replace_top(current); + } else { + minHeap_.pop(); + } + return true /* current key deleted */; + } else { + return false /* current key not deleted */; + } + } else { + return false /* current key not deleted */; + // range tombstone from an older sorted run with current key < end key. + // current key is not deleted and the older sorted run will have its range + // tombstone updated when the range tombstone's end key are popped from + // minHeap_. + } + } + // we can reach here only if active_ is empty + assert(active_.empty()); + assert(minHeap_.top()->type == HeapItem::ITERATOR); + return false /* current key not deleted */; +} + +void MergingIterator::SeekForPrevImpl(const Slice& target, + size_t starting_level, + bool range_tombstone_reseek) { + // active range tombstones before `starting_level` remain active + ClearHeaps(false /* clear_active */); + InitMaxHeap(); + ParsedInternalKey pik; + if (!range_tombstone_iters_.empty()) { + ParseInternalKey(target, &pik, false).PermitUncheckedError(); + } + for (size_t level = 0; level < starting_level; ++level) { + PERF_TIMER_GUARD(seek_max_heap_time); + AddToMaxHeapOrCheckStatus(&children_[level]); + } + if (!range_tombstone_iters_.empty()) { + // Add range tombstones before starting_level. 
+ for (size_t level = 0; level < starting_level; ++level) { + if (range_tombstone_iters_[level] && + range_tombstone_iters_[level]->Valid()) { + assert(static_cast<bool>(active_.count(level)) == + (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_START)); + maxHeap_->push(&pinned_heap_item_[level]); + } else { + assert(!active_.count(level)); + } + } + // levels >= starting_level will be reseeked below, + active_.erase(active_.lower_bound(starting_level), active_.end()); + } + + status_ = Status::OK(); + IterKey current_search_key; + current_search_key.SetInternalKey(target, false /* copy */); + // Seek target might change to some range tombstone end key, so + // we need to remember them for async requests. + // (level, target) pairs + autovector<std::pair<size_t, std::string>> prefetched_target; + for (auto level = starting_level; level < children_.size(); ++level) { + { + PERF_TIMER_GUARD(seek_child_seek_time); + children_[level].iter.SeekForPrev(current_search_key.GetInternalKey()); + } + + PERF_COUNTER_ADD(seek_child_seek_count, 1); + + if (!range_tombstone_iters_.empty()) { + if (range_tombstone_reseek) { + // This seek is to some range tombstone end key. + // Should only happen when there are range tombstones. + PERF_COUNTER_ADD(internal_range_del_reseek_count, 1); + } + if (children_[level].iter.status().IsTryAgain()) { + prefetched_target.emplace_back( + level, current_search_key.GetInternalKey().ToString()); + } + auto range_tombstone_iter = range_tombstone_iters_[level]; + if (range_tombstone_iter) { + range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey()); + if (range_tombstone_iter->Valid()) { + InsertRangeTombstoneToMaxHeap( + level, comparator_->Compare(range_tombstone_iter->end_key(), + pik) <= 0 /* end_key */); + // start key <= current_search_key guaranteed by the Seek() call above + // Only interested in user key coverage since older sorted runs must + // have smaller sequence numbers than this tombstone. + if (comparator_->user_comparator()->Compare( + current_search_key.GetUserKey(), + range_tombstone_iter->end_key().user_key) < 0) { + range_tombstone_reseek = true; + current_search_key.SetInternalKey( + range_tombstone_iter->start_key().user_key, kMaxSequenceNumber, + kValueTypeForSeekForPrev); + } + } + } + } + // child.iter.status() is set to Status::TryAgain indicating asynchronous + // request for retrieval of data blocks has been submitted. So it should + // return at this point and Seek should be called again to retrieve the + // requested block and add the child to min heap. + if (children_[level].iter.status().IsTryAgain()) { + continue; + } + { + // Strictly, we timed slightly more than min heap operation, + // but these operations are very cheap. + PERF_TIMER_GUARD(seek_max_heap_time); + AddToMaxHeapOrCheckStatus(&children_[level]); + } + } + + if (range_tombstone_iters_.empty()) { + for (auto& child : children_) { + if (child.iter.status().IsTryAgain()) { + child.iter.SeekForPrev(target); + { + PERF_TIMER_GUARD(seek_min_heap_time); + AddToMaxHeapOrCheckStatus(&child); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } + } else { + for (auto& prefetch : prefetched_target) { + // (level, target) pairs + children_[prefetch.first].iter.SeekForPrev(prefetch.second); + { + PERF_TIMER_GUARD(seek_max_heap_time); + AddToMaxHeapOrCheckStatus(&children_[prefetch.first]); + } + PERF_COUNTER_ADD(number_async_seek, 1); + } + } +} + +// See more in comments above SkipNextDeleted(). 
+// REQUIRES: +// - max heap is currently not empty, and iter is in kReverse direction. +// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current). +bool MergingIterator::SkipPrevDeleted() { + // 3 types of keys: + // - point key + // - file boundary sentinel keys + // - range deletion start key + auto current = maxHeap_->top(); + if (current->type == HeapItem::DELETE_RANGE_START) { + active_.erase(current->level); + assert(range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()); + range_tombstone_iters_[current->level]->Prev(); + if (range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */, + true /* replace_top */); + } else { + maxHeap_->pop(); + } + return true /* current key deleted */; + } + if (current->iter.IsDeleteRangeSentinelKey()) { + // LevelIterator enters a new SST file + current->iter.Prev(); + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + maxHeap_->replace_top(current); + } else { + maxHeap_->pop(); + } + if (!maxHeap_->empty() && maxHeap_->top()->level == current->level && + maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) { + maxHeap_->pop(); + active_.erase(current->level); + } + if (range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()) { + InsertRangeTombstoneToMaxHeap(current->level); + } + return true /* current key deleted */; + } + assert(current->type == HeapItem::ITERATOR); + // Point key case: check active_ for range tombstone coverage. + ParsedInternalKey pik; + ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); + if (!active_.empty()) { + auto i = *active_.begin(); + if (i < current->level) { + // range tombstone is from a newer level, definitely covers + assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + std::string target; + AppendInternalKey(&target, range_tombstone_iters_[i]->start_key()); + // This is different from SkipNextDeleted() which does reseek at sorted + // runs >= level (instead of i+1 here). With min heap, if level L is at + // top of the heap, then levels <L all have internal keys > level L's + // current internal key, which means levels <L are already at a different + // user key. With max heap, if level L is at top of the heap, then levels + // <L all have internal keys smaller than level L's current internal key, + // which might still be the same user key. + SeekForPrevImpl(target, i + 1, true); + return true /* current key deleted */; + } else if (i == current->level) { + // By `active_` we know current key is between start key and end key. 
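// The decision being made in these branches, reduced to its core: a point key
// is deleted if a strictly newer sorted run has an active tombstone over it,
// or if the same run's tombstone carries a larger sequence number. A sketch
// under the simplifying assumption that `active` already guarantees user-key
// containment (as `active_` does here); names are illustrative.

#include <cstddef>
#include <cstdint>
#include <set>

namespace coverage_sketch {

// `active` holds the levels whose range tombstone currently spans the
// heap-top key; smaller level number means newer data, matching children_.
bool PointKeyIsDeleted(const std::set<size_t>& active, size_t point_level,
                       uint64_t point_seqno, uint64_t tombstone_seqno) {
  if (active.empty()) {
    return false;  // no tombstone spans this key
  }
  const size_t newest_active = *active.begin();
  if (newest_active < point_level) {
    return true;  // a strictly newer sorted run deletes this key
  }
  if (newest_active == point_level) {
    // Same sorted run: only writes older than the tombstone are deleted.
    return point_seqno < tombstone_seqno;
  }
  return false;  // only older runs have tombstones here; the key survives
}

}  // namespace coverage_sketch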
+ assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) <= 0); + assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) < + 0); + if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { + current->iter.Prev(); + if (current->iter.Valid()) { + maxHeap_->replace_top(current); + } else { + maxHeap_->pop(); + } + return true /* current key deleted */; + } else { + return false /* current key not deleted */; + } + } else { + return false /* current key not deleted */; + } + } + + assert(active_.empty()); + assert(maxHeap_->top()->type == HeapItem::ITERATOR); + return false /* current key not deleted */; +} + +void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) { + if (child->iter.Valid()) { + assert(child->iter.status().ok()); + minHeap_.push(child); + } else { + considerStatus(child->iter.status()); + } +} + +void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) { + if (child->iter.Valid()) { + assert(child->iter.status().ok()); + maxHeap_->push(child); + } else { + considerStatus(child->iter.status()); + } +} + +// Advance all non current_ child to > current_.key(). +// We advance current_ after the this function call as it does not require +// Seek(). +// Advance all range tombstones iters, including the one corresponding to +// current_, to the first tombstone with end_key > current_.key(). +// TODO: potentially do cascading seek here too +void MergingIterator::SwitchToForward() { + ClearHeaps(); + Slice target = key(); + for (auto& child : children_) { + if (&child.iter != current_) { + child.iter.Seek(target); + // child.iter.status() is set to Status::TryAgain indicating asynchronous + // request for retrieval of data blocks has been submitted. So it should + // return at this point and Seek should be called again to retrieve the + // requested block and add the child to min heap. + if (child.iter.status() == Status::TryAgain()) { + continue; + } + if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + assert(child.iter.status().ok()); + child.iter.Next(); + } + } + AddToMinHeapOrCheckStatus(&child); + } + + for (auto& child : children_) { + if (child.iter.status() == Status::TryAgain()) { + child.iter.Seek(target); + if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + assert(child.iter.status().ok()); + child.iter.Next(); + } + AddToMinHeapOrCheckStatus(&child); + } + } + + // Current range tombstone iter also needs to seek for the following case: + // Previous direction is backward, so range tombstone iter may point to a + // tombstone before current_. If there is no such tombstone, then the range + // tombstone iter is !Valid(). Need to reseek here to make it valid again. + if (!range_tombstone_iters_.empty()) { + ParsedInternalKey pik; + ParseInternalKey(target, &pik, false /* log_err_key */) + .PermitUncheckedError(); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + auto iter = range_tombstone_iters_[i]; + if (iter) { + iter->Seek(pik.user_key); + // The while loop is needed as the Seek() call above is only for user + // key. We could have a range tombstone with end_key covering user_key, + // but still is smaller than target. This happens when the range + // tombstone is truncated at iter.largest_. 
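// The while-loop below discards tombstones that end at or before the current
// key after the direction switch. A simplified sketch of that re-seek over a
// start-key-sorted vector, ignoring sequence numbers and truncation
// (illustrative types, not the TruncatedRangeDelIterator API):

#include <cstddef>
#include <string>
#include <vector>

namespace reseek_sketch {

struct Range {
  std::string start;
  std::string end;  // exclusive
};

// Returns the index of the first tombstone whose end key is > current_key, or
// tombstones.size() if none; such a tombstone is the only one that can still
// cover current_key or anything after it.
size_t ReseekTombstones(const std::vector<Range>& tombstones,
                        const std::string& current_key) {
  size_t i = 0;
  while (i < tombstones.size() && tombstones[i].end <= current_key) {
    ++i;  // this tombstone ends at/before current_key; it cannot matter now
  }
  return i;
}

}  // namespace reseek_sketch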
+ while (iter->Valid() && + comparator_->Compare(iter->end_key(), pik) <= 0) { + iter->Next(); + } + if (range_tombstone_iters_[i]->Valid()) { + InsertRangeTombstoneToMinHeap( + i, comparator_->Compare(range_tombstone_iters_[i]->start_key(), + pik) > 0 /* start_key */); + } + } + } + } + + direction_ = kForward; + assert(current_ == CurrentForward()); +} + +// Advance all range tombstones iters, including the one corresponding to +// current_, to the first tombstone with start_key <= current_.key(). +void MergingIterator::SwitchToBackward() { + ClearHeaps(); + InitMaxHeap(); + Slice target = key(); + for (auto& child : children_) { + if (&child.iter != current_) { + child.iter.SeekForPrev(target); + TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); + if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + assert(child.iter.status().ok()); + child.iter.Prev(); + } + } + AddToMaxHeapOrCheckStatus(&child); + } + + ParsedInternalKey pik; + ParseInternalKey(target, &pik, false /* log_err_key */) + .PermitUncheckedError(); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + auto iter = range_tombstone_iters_[i]; + if (iter) { + iter->SeekForPrev(pik.user_key); + // Since the SeekForPrev() call above is only for user key, + // we may end up with some range tombstone with start key having the + // same user key at current_, but with a smaller sequence number. This + // makes current_ not at maxHeap_ top for the CurrentReverse() call + // below. If there is a range tombstone start key with the same user + // key and the same sequence number as current_.key(), it will be fine as + // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest + // op_type. + while (iter->Valid() && + comparator_->Compare(iter->start_key(), pik) > 0) { + iter->Prev(); + } + if (iter->Valid()) { + InsertRangeTombstoneToMaxHeap( + i, comparator_->Compare(range_tombstone_iters_[i]->end_key(), + pik) <= 0 /* end_key */); + } + } + } + + direction_ = kReverse; + if (!prefix_seek_mode_) { + // Note that we don't do assert(current_ == CurrentReverse()) here + // because it is possible to have some keys larger than the seek-key + // inserted between Seek() and SeekToLast(), which makes current_ not + // equal to CurrentReverse(). + current_ = CurrentReverse(); + } + assert(current_ == CurrentReverse()); +} + +void MergingIterator::ClearHeaps(bool clear_active) { + minHeap_.clear(); + if (maxHeap_) { + maxHeap_->clear(); + } + if (clear_active) { + active_.clear(); + } +} + +void MergingIterator::InitMaxHeap() { + if (!maxHeap_) { + maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_); + } +} + +// Repeatedly check and remove heap top key if it is not a point key +// that is not covered by range tombstones. SeekImpl() is called to seek to end +// of a range tombstone if the heap top is a point key covered by some range +// tombstone from a newer sorted run. If the covering tombstone is from current +// key's level, then the current child iterator is simply advanced to its next +// key without reseeking. +inline void MergingIterator::FindNextVisibleKey() { + // When active_ is empty, we know heap top cannot be a range tombstone end + // key. It cannot be a range tombstone start key per PopDeleteRangeStart(). 
+ PopDeleteRangeStart(); + while (!minHeap_.empty() && + (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) && + SkipNextDeleted()) { + PopDeleteRangeStart(); + } +} + +inline void MergingIterator::FindPrevVisibleKey() { + PopDeleteRangeEnd(); + while (!maxHeap_->empty() && + (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) && + SkipPrevDeleted()) { + PopDeleteRangeEnd(); + } +} + +InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, + InternalIterator** list, int n, + Arena* arena, bool prefix_seek_mode) { + assert(n >= 0); + if (n == 0) { + return NewEmptyInternalIterator<Slice>(arena); + } else if (n == 1) { + return list[0]; + } else { + if (arena == nullptr) { + return new MergingIterator(cmp, list, n, false, prefix_seek_mode); + } else { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode); + } + } +} + +MergeIteratorBuilder::MergeIteratorBuilder( + const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode, + const Slice* iterate_upper_bound) + : first_iter(nullptr), use_merging_iter(false), arena(a) { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true, + prefix_seek_mode, iterate_upper_bound); +} + +MergeIteratorBuilder::~MergeIteratorBuilder() { + if (first_iter != nullptr) { + first_iter->~InternalIterator(); + } + if (merge_iter != nullptr) { + merge_iter->~MergingIterator(); + } +} + +void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { + if (!use_merging_iter && first_iter != nullptr) { + merge_iter->AddIterator(first_iter); + use_merging_iter = true; + first_iter = nullptr; + } + if (use_merging_iter) { + merge_iter->AddIterator(iter); + } else { + first_iter = iter; + } +} + +void MergeIteratorBuilder::AddPointAndTombstoneIterator( + InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter, + TruncatedRangeDelIterator*** tombstone_iter_ptr) { + // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator. + bool add_range_tombstone = tombstone_iter || + !merge_iter->range_tombstone_iters_.empty() || + tombstone_iter_ptr; + if (!use_merging_iter && (add_range_tombstone || first_iter)) { + use_merging_iter = true; + if (first_iter) { + merge_iter->AddIterator(first_iter); + first_iter = nullptr; + } + } + if (use_merging_iter) { + merge_iter->AddIterator(point_iter); + if (add_range_tombstone) { + // If there was a gap, fill in nullptr as empty range tombstone iterators. + while (merge_iter->range_tombstone_iters_.size() < + merge_iter->children_.size() - 1) { + merge_iter->AddRangeTombstoneIterator(nullptr); + } + merge_iter->AddRangeTombstoneIterator(tombstone_iter); + } + + if (tombstone_iter_ptr) { + // This is needed instead of setting to &range_tombstone_iters_[i] + // directly here since the memory address of range_tombstone_iters_[i] + // might change during vector resizing. 
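// Why an index is recorded here instead of &range_tombstone_iters_[i]: the
// vector can still grow while iterators are being added, and growth may
// reallocate, invalidating any element address captured earlier. A small
// standard-library sketch of the hazard and the record-index-then-resolve-in-
// Finish() fix (int* stands in for TruncatedRangeDelIterator*; names are
// illustrative):

#include <cstddef>
#include <utility>
#include <vector>

namespace ptr_sketch {

struct BuilderSketch {
  std::vector<int*> slots;
  // (index into slots, where the caller wants the element's address published)
  std::vector<std::pair<size_t, int***>> requests;

  void Add(int* value, int*** publish_here) {
    slots.push_back(value);
    if (publish_here != nullptr) {
      // Do NOT store &slots.back() yet: a later push_back may reallocate and
      // leave that address dangling. Remember the index instead.
      requests.emplace_back(slots.size() - 1, publish_here);
    }
  }

  void Finish() {
    // No more growth from here on, so element addresses are stable now.
    for (auto& req : requests) {
      *(req.second) = &slots[req.first];
    }
  }
};

}  // namespace ptr_sketch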
+ range_del_iter_ptrs_.emplace_back( + merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr); + } + } else { + first_iter = point_iter; + } +} + +InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) { + InternalIterator* ret = nullptr; + if (!use_merging_iter) { + ret = first_iter; + first_iter = nullptr; + } else { + for (auto& p : range_del_iter_ptrs_) { + *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]); + } + if (db_iter && !merge_iter->range_tombstone_iters_.empty()) { + // memtable is always the first level + db_iter->SetMemtableRangetombstoneIter( + &merge_iter->range_tombstone_iters_.front()); + } + merge_iter->Finish(); + ret = merge_iter; + merge_iter = nullptr; + } + return ret; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/merging_iterator.h b/src/rocksdb/table/merging_iterator.h new file mode 100644 index 000000000..16fc0877e --- /dev/null +++ b/src/rocksdb/table/merging_iterator.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/range_del_aggregator.h" +#include "rocksdb/slice.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class ArenaWrappedDBIter; +class InternalKeyComparator; + +template <class TValue> +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase<Slice>; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern InternalIterator* NewMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, int n, + Arena* arena = nullptr, bool prefix_seek_mode = false); + +class MergingIterator; + +// A builder class to build a merging iterator by adding iterators one by one. +// User should call only one of AddIterator() or AddPointAndTombstoneIterator() +// exclusively for the same builder. +class MergeIteratorBuilder { + public: + // comparator: the comparator used in merging comparator + // arena: where the merging iterator needs to be allocated from. + explicit MergeIteratorBuilder(const InternalKeyComparator* comparator, + Arena* arena, bool prefix_seek_mode = false, + const Slice* iterate_upper_bound = nullptr); + ~MergeIteratorBuilder(); + + // Add iter to the merging iterator. + void AddIterator(InternalIterator* iter); + + // Add a point key iterator and a range tombstone iterator. + // `tombstone_iter_ptr` should and only be set by LevelIterator. + // *tombstone_iter_ptr will be set to where the merging iterator stores + // `tombstone_iter` when MergeIteratorBuilder::Finish() is called. This is + // used by LevelIterator to update range tombstone iters when switching to a + // different SST file. 
If a single point iterator with a nullptr range + // tombstone iterator is provided, and the point iterator is not a level + // iterator, then this builder will return the point iterator directly, + // instead of creating a merging iterator on top of it. Internally, if all + // point iterators are not LevelIterator, then range tombstone iterator is + // only added to the merging iter if there is a non-null `tombstone_iter`. + void AddPointAndTombstoneIterator( + InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter, + TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr); + + // Get arena used to build the merging iterator. It is called one a child + // iterator needs to be allocated. + Arena* GetArena() { return arena; } + + // Return the result merging iterator. + // If db_iter is not nullptr, then db_iter->SetMemtableRangetombstoneIter() + // will be called with pointer to where the merging iterator + // stores the memtable range tombstone iterator. + // This is used for DB iterator to refresh memtable range tombstones. + InternalIterator* Finish(ArenaWrappedDBIter* db_iter = nullptr); + + private: + MergingIterator* merge_iter; + InternalIterator* first_iter; + bool use_merging_iter; + Arena* arena; + // Used to set LevelIterator.range_tombstone_iter_. + // See AddRangeTombstoneIterator() implementation for more detail. + std::vector<std::pair<size_t, TruncatedRangeDelIterator***>> + range_del_iter_ptrs_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc new file mode 100644 index 000000000..78a62359d --- /dev/null +++ b/src/rocksdb/table/meta_blocks.cc @@ -0,0 +1,553 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include "table/meta_blocks.h" + +#include <map> +#include <string> + +#include "block_fetcher.h" +#include "db/table_properties_collector.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block.h" +#include "table/block_based/reader_common.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_properties_internal.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kPropertiesBlockName = "rocksdb.properties"; +// Old property block name for backward compatibility +const std::string kPropertiesBlockOldName = "rocksdb.stats"; +const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; +const std::string kRangeDelBlockName = "rocksdb.range_del"; + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} + +void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +// Property block will be read sequentially and cached in a heap located +// object, so there's no need for restart points. Thus we set the restart +// interval to infinity to save space. +PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_(new BlockBuilder( + std::numeric_limits<int32_t>::max() /* restart interval */)) {} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start", + const_cast<TableProperties*>(&props)); + + Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number); + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + if (props.index_partitions != 0) { + Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); + Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); + } + Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); + Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); + Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); + 
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id); + Add(TablePropertiesNames::kCreationTime, props.creation_time); + Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time); + if (props.file_creation_time > 0) { + Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time); + } + if (props.slow_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize, + props.slow_compression_estimated_data_size); + } + if (props.fast_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, + props.fast_compression_estimated_data_size); + } + if (!props.db_id.empty()) { + Add(TablePropertiesNames::kDbId, props.db_id); + } + if (!props.db_session_id.empty()) { + Add(TablePropertiesNames::kDbSessionId, props.db_session_id); + } + if (!props.db_host_id.empty()) { + Add(TablePropertiesNames::kDbHostId, props.db_host_id); + } + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); + } + if (!props.comparator_name.empty()) { + Add(TablePropertiesNames::kComparator, props.comparator_name); + } + + if (!props.merge_operator_name.empty()) { + Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name); + } + if (!props.prefix_extractor_name.empty()) { + Add(TablePropertiesNames::kPrefixExtractorName, + props.prefix_extractor_name); + } + if (!props.property_collectors_names.empty()) { + Add(TablePropertiesNames::kPropertyCollectors, + props.property_collectors_names); + } + if (!props.column_family_name.empty()) { + Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name); + } + + if (!props.compression_name.empty()) { + Add(TablePropertiesNames::kCompression, props.compression_name); + } + if (!props.compression_options.empty()) { + Add(TablePropertiesNames::kCompressionOptions, props.compression_options); + } + if (!props.seqno_to_time_mapping.empty()) { + Add(TablePropertiesNames::kSequenceNumberTimeMapping, + props.seqno_to_time_mapping); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "Encountered error when calling TablePropertiesCollector::" + method + + "() with collector name: " + name; + ROCKS_LOG_ERROR(info_log, "%s", msg.c_str()); +} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, const Slice& value, uint64_t file_size, + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto& collector : collectors) { + Status s = collector->InternalAdd(key, value, file_size); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Add" /* method */, + collector->Name()); + } + } + return all_succeeded; +} + +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + const 
uint64_t block_uncomp_bytes, + const uint64_t block_compressed_bytes_fast, + const uint64_t block_compressed_bytes_slow) { + for (auto& collector : collectors) { + collector->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); + } +} + +bool NotifyCollectTableCollectorsOnFinish( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log, PropertyBlockBuilder* builder) { + bool all_succeeded = true; + for (auto& collector : collectors) { + UserCollectedProperties user_collected_properties; + Status s = collector->Finish(&user_collected_properties); + + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Finish" /* method */, + collector->Name()); + } else { + builder->Add(user_collected_properties); + } + } + + return all_succeeded; +} + +// FIXME: should be a parameter for reading table properties to use persistent +// cache? +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr<TableProperties>* table_properties, + MemoryAllocator* memory_allocator) { + assert(table_properties); + + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. For this reason, we initially read + // and process without checksum verification, then later try checksum + // verification so that if it fails, we can copy to a temporary buffer with + // global seqno set to its original value, i.e. 0, and attempt checksum + // verification again. 
+ ReadOptions modified_ro = ro; + modified_ro.verify_checksums = false; + BlockContents block_contents; + BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle, + &block_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kProperties, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator); + Status s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + + // Unfortunately, Block::size() might not equal block_contents.data.size(), + // and Block hides block_contents + uint64_t block_size = block_contents.data.size(); + Block properties_block(std::move(block_contents)); + std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator()); + + std::unique_ptr<TableProperties> new_table_properties{new TableProperties}; + // All pre-defined properties of type uint64_t + std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = { + {TablePropertiesNames::kOriginalFileNumber, + &new_table_properties->orig_file_number}, + {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, + {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, + {TablePropertiesNames::kIndexPartitions, + &new_table_properties->index_partitions}, + {TablePropertiesNames::kTopLevelIndexSize, + &new_table_properties->top_level_index_size}, + {TablePropertiesNames::kIndexKeyIsUserKey, + &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, + {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, + {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, + {TablePropertiesNames::kRawValueSize, + &new_table_properties->raw_value_size}, + {TablePropertiesNames::kNumDataBlocks, + &new_table_properties->num_data_blocks}, + {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, + {TablePropertiesNames::kFormatVersion, + &new_table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, + &new_table_properties->fixed_key_len}, + {TablePropertiesNames::kColumnFamilyId, + &new_table_properties->column_family_id}, + {TablePropertiesNames::kCreationTime, + &new_table_properties->creation_time}, + {TablePropertiesNames::kOldestKeyTime, + &new_table_properties->oldest_key_time}, + {TablePropertiesNames::kFileCreationTime, + &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, + }; + + std::string last_key; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); + if (!s.ok()) { + break; + } + + auto key = iter->key().ToString(); + // properties block should be strictly sorted with no duplicate key. 
+ if (!last_key.empty() && + BytewiseComparator()->Compare(key, last_key) <= 0) { + s = Status::Corruption("properties unsorted"); + break; + } + last_key = key; + + auto raw_val = iter->value(); + auto pos = predefined_uint64_properties.find(key); + + if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { + new_table_properties->external_sst_file_global_seqno_offset = + handle.offset() + iter->ValueOffset(); + } + + if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "Detect malformed value in properties meta-block:" + "\tkey: " + + key + "\tval: " + raw_val.ToString(); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kDbId) { + new_table_properties->db_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbSessionId) { + new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kFilterPolicy) { + new_table_properties->filter_policy_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kColumnFamilyName) { + new_table_properties->column_family_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kComparator) { + new_table_properties->comparator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kMergeOperator) { + new_table_properties->merge_operator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPrefixExtractorName) { + new_table_properties->prefix_extractor_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPropertyCollectors) { + new_table_properties->property_collectors_names = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompression) { + new_table_properties->compression_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompressionOptions) { + new_table_properties->compression_options = raw_val.ToString(); + } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) { + new_table_properties->seqno_to_time_mapping = raw_val.ToString(); + } else { + // handle user-collected properties + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + } + + // Modified version of BlockFetcher checksum verification + // (See write_global_seqno comment above) + if (s.ok() && footer.GetBlockTrailerSize() > 0) { + s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(), + block_size, file->file_name(), handle.offset()); + if (s.IsCorruption()) { + if (new_table_properties->external_sst_file_global_seqno_offset != 0) { + std::string tmp_buf(properties_block.data(), + block_fetcher.GetBlockSizeWithTrailer()); + uint64_t global_seqno_offset = + new_table_properties->external_sst_file_global_seqno_offset - + handle.offset(); + EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0); + s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(), + block_size, file->file_name(), handle.offset()); + } + } + } + + if (s.ok()) { + *table_properties 
= std::move(new_table_properties); + } + + return s; +} + +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + std::unique_ptr<TableProperties>* properties, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer) { + BlockHandle block_handle; + Footer footer; + Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); + if (!s.ok()) { + return s; + } + + if (!block_handle.IsNull()) { + s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + prefetch_buffer, footer, ioptions, properties, + memory_allocator); + } else { + s = Status::NotFound(); + } + return s; +} + +Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle) { + assert(block_handle != nullptr); + meta_index_iter->Seek(meta_block_name); + if (meta_index_iter->status().ok()) { + if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } else if (meta_block_name == kPropertiesBlockName) { + // Have to try old name for compatibility + meta_index_iter->Seek(kPropertiesBlockOldName); + if (meta_index_iter->status().ok() && meta_index_iter->Valid() && + meta_index_iter->key() == kPropertiesBlockOldName) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } + } + } + // else + *block_handle = BlockHandle::NullBlockHandle(); + return meta_index_iter->status(); +} + +Status FindMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle) { + Status s = + FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle); + if (s.ok() && block_handle->IsNull()) { + return Status::Corruption("Cannot find the meta block", meta_block_name); + } else { + return s; + } +} + +Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, + uint64_t file_size, uint64_t table_magic_number, + const ImmutableOptions& ioptions, + BlockContents* metaindex_contents, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, + Footer* footer_out) { + Footer footer; + IOOptions opts; + auto s = ReadFooterFromFile(opts, file, prefetch_buffer, file_size, &footer, + table_magic_number); + if (!s.ok()) { + return s; + } + if (footer_out) { + *footer_out = footer; + } + + auto metaindex_handle = footer.metaindex_handle(); + return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + metaindex_handle, metaindex_contents, ioptions, + false /* do decompression */, false /*maybe_compressed*/, + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); +} + +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, + Footer* footer_out) { + BlockContents metaindex_contents; + auto s = ReadMetaIndexBlockInFile( + file, file_size, table_magic_number, ioptions, &metaindex_contents, + memory_allocator, prefetch_buffer, footer_out); + if (!s.ok()) { + return s; + } + // meta blocks are never compressed. Need to add uncompress logic if we are to + // compress it. 
+ Block metaindex_block(std::move(metaindex_contents)); + + std::unique_ptr<InternalIterator> meta_iter; + meta_iter.reset(metaindex_block.NewMetaIterator()); + + return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); +} + +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, BlockType block_type, + BlockContents* contents, + MemoryAllocator* memory_allocator) { + // TableProperties requires special handling because of checksum issues. + // Call ReadTableProperties instead for that case. + assert(block_type != BlockType::kProperties); + + BlockHandle block_handle; + Footer footer; + Status status = FindMetaBlockInFile( + file, file_size, table_magic_number, ioptions, meta_block_name, + &block_handle, memory_allocator, prefetch_buffer, &footer); + if (!status.ok()) { + return status; + } + + return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + block_handle, contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h new file mode 100644 index 000000000..b867dd01d --- /dev/null +++ b/src/rocksdb/table/meta_blocks.h @@ -0,0 +1,168 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "db/builder.h" +#include "db/table_properties_collector.h" +#include "rocksdb/comparator.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/block_type.h" +#include "table/format.h" +#include "util/kv_map.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class Env; +class Footer; +class Logger; +class RandomAccessFile; +struct TableProperties; + +// Meta block names for metaindex +extern const std::string kPropertiesBlockName; +extern const std::string kPropertiesBlockOldName; +extern const std::string kCompressionDictBlockName; +extern const std::string kRangeDelBlockName; + +class MetaIndexBuilder { + public: + MetaIndexBuilder(const MetaIndexBuilder&) = delete; + MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete; + + MetaIndexBuilder(); + void Add(const std::string& key, const BlockHandle& handle); + + // Write all the added key/value pairs to the block and return the contents + // of the block. + Slice Finish(); + + private: + // store the sorted key/handle of the metablocks. 
+ stl_wrappers::KVMap meta_block_handles_; + std::unique_ptr<BlockBuilder> meta_index_block_; +}; + +class PropertyBlockBuilder { + public: + PropertyBlockBuilder(const PropertyBlockBuilder&) = delete; + PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete; + + PropertyBlockBuilder(); + + void AddTableProperty(const TableProperties& props); + void Add(const std::string& key, uint64_t value); + void Add(const std::string& key, const std::string& value); + void Add(const UserCollectedProperties& user_collected_properties); + + // Write all the added entries to the block and return the block contents + Slice Finish(); + + private: + std::unique_ptr<BlockBuilder> properties_block_; + stl_wrappers::KVMap props_; +}; + +// Were we encounter any error occurs during user-defined statistics collection, +// we'll write the warning message to info log. +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name); + +// Utility functions help table builder to trigger batch events for user +// defined property collectors. +// Return value indicates if there is any error occurred; if error occurred, +// the warning message will be logged. +// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all +// property collectors. +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, const Slice& value, uint64_t file_size, + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log); + +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow); + +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all +// property collectors. The collected properties will be added to `builder`. +bool NotifyCollectTableCollectorsOnFinish( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log, PropertyBlockBuilder* builder); + +// Read table properties from a file using known BlockHandle. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise value of `table_properties` will not be modified. +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr<TableProperties>* table_properties, + MemoryAllocator* memory_allocator = nullptr); + +// Read table properties from the properties block of a plain table. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise value of `table_properties` will not be modified. +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + std::unique_ptr<TableProperties>* properties, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr); + +// Find the meta block from the meta index block. Returns OK and +// block_handle->IsNull() if not found. +Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle); + +// Find the meta block from the meta index block. 
Returns Corruption if not +// found. +Status FindMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle); + +// Find the meta block +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, + Footer* footer_out = nullptr); + +// Read meta block contents +Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, + uint64_t file_size, uint64_t table_magic_number, + const ImmutableOptions& ioptions, + BlockContents* block_contents, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, + Footer* footer_out = nullptr); + +// Read the specified meta block with name meta_block_name +// from `file` and initialize `contents` with contents of this block. +// Return Status::OK in case of success. +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, BlockType block_type, + BlockContents* contents, + MemoryAllocator* memory_allocator = nullptr); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc new file mode 100644 index 000000000..130889eaa --- /dev/null +++ b/src/rocksdb/table/mock_table.cc @@ -0,0 +1,344 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
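Before the mock table implementation, an editorial sketch of how the meta_blocks.h helpers declared above compose. The wrapper name and the caller-supplied reader, file size, magic number and options are assumptions, not RocksDB code.

#include <memory>

#include "table/meta_blocks.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical helper: fetch one predefined property from an SST file whose
// footer magic number is already known.
Status GetEntryCount(RandomAccessFileReader* file, uint64_t file_size,
                     uint64_t table_magic_number,
                     const ImmutableOptions& ioptions, uint64_t* num_entries) {
  std::unique_ptr<TableProperties> props;
  // Locates the properties block via the metaindex block (falling back to
  // the old "rocksdb.stats" name) and decodes it.
  Status s = ReadTableProperties(file, file_size, table_magic_number, ioptions,
                                 &props);
  if (s.ok()) {
    *num_entries = props->num_entries;
  }
  return s;
}

}  // namespace ROCKSDB_NAMESPACE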
+ +#include "table/mock_table.h" + +#include "db/dbformat.h" +#include "env/composite_env_wrapper.h" +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/table_properties.h" +#include "table/get_context.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { + +KVVector MakeMockFile(std::initializer_list<KVPair> l) { return KVVector(l); } + +void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) { + InternalKeyComparator icmp(ucmp); + std::sort(kv_vector->begin(), kv_vector->end(), + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); +} + +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const KVVector& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr<const TableProperties> GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const KVVector& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const KVVector& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + } + + void SeekForPrev(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const KVVector& table_; + KVVector::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, + MockTableFactory::MockCorruptionMode corrupt_mode = + MockTableFactory::kCorruptNone, + size_t key_value_size = 1) + : id_(id), + file_system_(file_system), + corrupt_mode_(corrupt_mode), + key_value_size_(key_value_size) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. 
+ ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + if (corrupt_mode_ == MockTableFactory::kCorruptValue) { + // Corrupt the value + table_.push_back({key.ToString(), value.ToString() + " "}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) { + table_.push_back({key.ToString() + " ", value.ToString()}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) { + if (prev_key_.empty()) { + prev_key_ = key.ToString(); + prev_value_ = value.ToString(); + } else { + table_.push_back({key.ToString(), value.ToString()}); + table_.push_back({prev_key_, prev_value_}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } + } else { + table_.push_back({key.ToString(), value.ToString()}); + } + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return IOStatus::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size() * key_value_size_; } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + // Get file checksum + std::string GetFileChecksum() const override { return kUnknownFileChecksum; } + // Get file checksum function name + const char* GetFileChecksumFuncName() const override { + return kUnknownFileChecksumFuncName; + } + + private: + uint32_t id_; + std::string prev_key_; + std::string prev_value_; + MockTableFileSystem* file_system_; + int corrupt_mode_; + KVVector table_; + size_t key_value_size_; +}; + +InternalIterator* MockTableReader::NewIterator( + const ReadOptions&, const SliceTransform* /* prefix_extractor */, + Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { + return new MockTableIterator(table_); +} + +Status MockTableReader::Get(const ReadOptions&, const Slice& key, + GetContext* get_context, + const SliceTransform* /*prefix_extractor*/, + bool /*skip_filters*/) { + std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_)); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + Status pik_status = + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + return pik_status; + } + + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) { + break; + } + } + return Status::OK(); +} + +std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties() + const { + return std::shared_ptr<const TableProperties>(new TableProperties()); +} + +MockTableFactory::MockTableFactory() + : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {} + +Status MockTableFactory::NewTableReader( + const ReadOptions& /*ro*/, + const TableReaderOptions& /*table_reader_options*/, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/, + 
std::unique_ptr<TableReader>* table_reader, + bool /*prefetch_index_and_filter_in_cache*/) const { + uint32_t id; + Status s = GetIDFromFile(file.get(), &id); + if (!s.ok()) { + return s; + } + + MutexLock lock_guard(&file_system_.mutex); + + auto it = file_system_.files.find(id); + if (it == file_system_.files.end()) { + return Status::IOError("Mock file not found"); + } + + table_reader->reset(new MockTableReader(it->second)); + + return Status::OK(); +} + +TableBuilder* MockTableFactory::NewTableBuilder( + const TableBuilderOptions& /*table_builder_options*/, + WritableFileWriter* file) const { + uint32_t id; + Status s = GetAndWriteNextID(file, &id); + assert(s.ok()); + + return new MockTableBuilder(id, &file_system_, corrupt_mode_, + key_value_size_); +} + +Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, + KVVector file_contents) { + std::unique_ptr<WritableFileWriter> file_writer; + Status s = WritableFileWriter::Create(env->GetFileSystem(), fname, + FileOptions(), &file_writer, nullptr); + if (!s.ok()) { + return s; + } + uint32_t id; + s = GetAndWriteNextID(file_writer.get(), &id); + if (s.ok()) { + file_system_.files.insert({id, std::move(file_contents)}); + } + return s; +} + +Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, + uint32_t* next_id) const { + *next_id = next_id_.fetch_add(1); + char buf[4]; + EncodeFixed32(buf, *next_id); + return file->Append(Slice(buf, 4)); +} + +Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, + uint32_t* id) const { + char buf[4]; + Slice result; + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + assert(result.size() == 4); + *id = DecodeFixed32(buf); + return s; +} + +void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { + ASSERT_EQ(file_system_.files.size(), 1U); + ASSERT_EQ(file_contents, file_system_.files.begin()->second); +} + +void MockTableFactory::AssertLatestFiles( + const std::vector<KVVector>& files_contents) { + ASSERT_GE(file_system_.files.size(), files_contents.size()); + auto it = file_system_.files.rbegin(); + for (auto expect = files_contents.rbegin(); expect != files_contents.rend(); + expect++, it++) { + ASSERT_TRUE(it != file_system_.files.rend()); + if (*expect != it->second) { + std::cout << "Wrong content! Content of file, expect:" << std::endl; + for (const auto& kv : *expect) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; + } + std::cout << "actual:" << std::endl; + for (const auto& kv : it->second) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; + } + FAIL(); + } + } +} + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h new file mode 100644 index 000000000..e4850d060 --- /dev/null +++ b/src/rocksdb/table/mock_table.h @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
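An editorial aside before the mock_table.h header: a sketch of how a test might seed the factory implemented above with a ready-made file. The helper name, file name argument and key/value contents are placeholders; note that CreateMockTable expects encoded internal keys.

#include <string>
#include <utility>

#include "db/dbformat.h"
#include "table/mock_table.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical test helper: build a two-entry mock file and register it.
Status SeedMockFile(Env* env, mock::MockTableFactory* factory,
                    const std::string& fname) {
  mock::KVVector contents = mock::MakeMockFile(
      {{InternalKey("a", 1 /* sequence */, kTypeValue).Encode().ToString(),
        "value_a"},
       {InternalKey("b", 2 /* sequence */, kTypeValue).Encode().ToString(),
        "value_b"}});
  mock::SortKVVector(&contents);  // keep entries in internal-key order
  return factory->CreateMockTable(env, fname, std::move(contents));
}

}  // namespace ROCKSDB_NAMESPACE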
+#pragma once + +#include <algorithm> +#include <atomic> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <utility> + +#include "db/version_edit.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/io_status.h" +#include "rocksdb/table.h" +#include "table/internal_iterator.h" +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/kv_map.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { +using KVPair = std::pair<std::string, std::string>; +using KVVector = std::vector<KVPair>; + +KVVector MakeMockFile(std::initializer_list<KVPair> l = {}); +void SortKVVector(KVVector* kv_vector, + const Comparator* ucmp = BytewiseComparator()); + +struct MockTableFileSystem { + port::Mutex mutex; + std::map<uint32_t, KVVector> files; +}; + +class MockTableFactory : public TableFactory { + public: + enum MockCorruptionMode { + kCorruptNone, + kCorruptKey, + kCorruptValue, + kCorruptReorderKey, + }; + + MockTableFactory(); + static const char* kClassName() { return "MockTable"; } + const char* Name() const override { return kClassName(); } + using TableFactory::NewTableReader; + Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + // This function will directly create mock table instead of going through + // MockTableBuilder. file_contents has to have a format of <internal_key, + // value>. Those key-value pairs will then be inserted into the mock table. + Status CreateMockTable(Env* env, const std::string& fname, + KVVector file_contents); + + virtual std::string GetPrintableOptions() const override { + return std::string(); + } + + void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } + + void SetKeyValueSize(size_t size) { key_value_size_ = size; } + // This function will assert that only a single file exists and that the + // contents are equal to file_contents + void AssertSingleFile(const KVVector& file_contents); + void AssertLatestFiles(const std::vector<KVVector>& files_contents); + + private: + Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const; + Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const; + + mutable MockTableFileSystem file_system_; + mutable std::atomic<uint32_t> next_id_; + MockCorruptionMode corrupt_mode_; + + size_t key_value_size_ = 1; +}; + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/multiget_context.h b/src/rocksdb/table/multiget_context.h new file mode 100644 index 000000000..76027a952 --- /dev/null +++ b/src/rocksdb/table/multiget_context.h @@ -0,0 +1,402 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
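Another editorial sketch before the MultiGet context header: how the mock factory declared above is usually wired into test options. The helper name and the options argument are illustrative assumptions.

#include <memory>

#include "rocksdb/options.h"
#include "table/mock_table.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical test setup: route all table reads and writes through the
// in-memory MockTableFileSystem owned by the factory.
std::shared_ptr<mock::MockTableFactory> UseMockTables(Options* options) {
  auto factory = std::make_shared<mock::MockTableFactory>();
  // Tests can call factory->SetCorruptionMode(...) with kCorruptKey,
  // kCorruptValue or kCorruptReorderKey to exercise corruption handling.
  options->table_factory = factory;
  // Keep a handle so tests can call AssertSingleFile()/AssertLatestFiles().
  return factory;
}

}  // namespace ROCKSDB_NAMESPACE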
+ +#pragma once +#include <algorithm> +#include <array> +#include <string> + +#include "db/dbformat.h" +#include "db/lookup_key.h" +#include "db/merge_context.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/types.h" +#include "util/async_file_reader.h" +#include "util/autovector.h" +#include "util/math.h" +#include "util/single_thread_executor.h" + +namespace ROCKSDB_NAMESPACE { +class GetContext; + +struct KeyContext { + const Slice* key; + LookupKey* lkey; + Slice ukey_with_ts; + Slice ukey_without_ts; + Slice ikey; + ColumnFamilyHandle* column_family; + Status* s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq; + bool key_exists; + bool is_blob_index; + void* cb_arg; + PinnableSlice* value; + std::string* timestamp; + GetContext* get_context; + + KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, + PinnableSlice* val, std::string* ts, Status* stat) + : key(&user_key), + lkey(nullptr), + column_family(col_family), + s(stat), + max_covering_tombstone_seq(0), + key_exists(false), + is_blob_index(false), + cb_arg(nullptr), + value(val), + timestamp(ts), + get_context(nullptr) {} + + KeyContext() = default; +}; + +// The MultiGetContext class is a container for the sorted list of keys that +// we need to lookup in a batch. Its main purpose is to make batch execution +// easier by allowing various stages of the MultiGet lookups to operate on +// subsets of keys, potentially non-contiguous. In order to accomplish this, +// it defines the following classes - +// +// MultiGetContext::Range +// MultiGetContext::Range::Iterator +// MultiGetContext::Range::IteratorWrapper +// +// Here is an example of how this can be used - +// +// { +// MultiGetContext ctx(...); +// MultiGetContext::Range range = ctx.GetMultiGetRange(); +// +// // Iterate to determine some subset of the keys +// MultiGetContext::Range::Iterator start = range.begin(); +// MultiGetContext::Range::Iterator end = ...; +// +// // Make a new range with a subset of keys +// MultiGetContext::Range subrange(range, start, end); +// +// // Define an auxillary vector, if needed, to hold additional data for +// // each key +// std::array<Foo, MultiGetContext::MAX_BATCH_SIZE> aux; +// +// // Iterate over the subrange and the auxillary vector simultaneously +// MultiGetContext::Range::Iterator iter = subrange.begin(); +// for (; iter != subrange.end(); ++iter) { +// KeyContext& key = *iter; +// Foo& aux_key = aux_iter[iter.index()]; +// ... +// } +// } +class MultiGetContext { + public: + // Limit the number of keys in a batch to this number. Benchmarks show that + // there is negligible benefit for batches exceeding this. 
Keeping this < 32 + // simplifies iteration, as well as reduces the amount of stack allocations + // that need to be performed + static const int MAX_BATCH_SIZE = 32; + + // A bitmask of at least MAX_BATCH_SIZE - 1 bits, so that + // Mask{1} << MAX_BATCH_SIZE is well defined + using Mask = uint64_t; + static_assert(MAX_BATCH_SIZE < sizeof(Mask) * 8); + + MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys, + size_t begin, size_t num_keys, SequenceNumber snapshot, + const ReadOptions& read_opts, FileSystem* fs, + Statistics* stats) + : num_keys_(num_keys), + value_mask_(0), + value_size_(0), + lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) +#if USE_COROUTINES + , + reader_(fs, stats), + executor_(reader_) +#endif // USE_COROUTINES + { + (void)fs; + (void)stats; + assert(num_keys <= MAX_BATCH_SIZE); + if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) { + lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]); + lookup_key_ptr_ = reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get()); + } + + for (size_t iter = 0; iter != num_keys_; ++iter) { + // autovector may not be contiguous storage, so make a copy + sorted_keys_[iter] = (*sorted_keys)[begin + iter]; + sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) + LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); + sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( + sorted_keys_[iter]->lkey->user_key(), + read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); + sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); + sorted_keys_[iter]->timestamp = (*sorted_keys)[begin + iter]->timestamp; + sorted_keys_[iter]->get_context = + (*sorted_keys)[begin + iter]->get_context; + } + } + + ~MultiGetContext() { + for (size_t i = 0; i < num_keys_; ++i) { + lookup_key_ptr_[i].~LookupKey(); + } + } + +#if USE_COROUTINES + SingleThreadExecutor& executor() { return executor_; } + + AsyncFileReader& reader() { return reader_; } +#endif // USE_COROUTINES + + private: + static const int MAX_LOOKUP_KEYS_ON_STACK = 16; + alignas( + alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) * + MAX_LOOKUP_KEYS_ON_STACK]; + std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_; + size_t num_keys_; + Mask value_mask_; + uint64_t value_size_; + std::unique_ptr<char[]> lookup_key_heap_buf; + LookupKey* lookup_key_ptr_; +#if USE_COROUTINES + AsyncFileReader reader_; + SingleThreadExecutor executor_; +#endif // USE_COROUTINES + + public: + // MultiGetContext::Range - Specifies a range of keys, by start and end index, + // from the parent MultiGetContext. Each range contains a bit vector that + // indicates whether the corresponding keys need to be processed or skipped. + // A Range object can be copy constructed, and the new object inherits the + // original Range's bit vector. This is useful for progressively skipping + // keys as the lookup goes through various stages. For example, when looking + // up keys in the same SST file, a Range is created excluding keys not + // belonging to that file. A new Range is then copy constructed and individual + // keys are skipped based on bloom filter lookup. + class Range { + public: + // MultiGetContext::Range::Iterator - A forward iterator that iterates over + // non-skippable keys in a Range, as well as keys whose final value has been + // found. 
The latter is tracked by MultiGetContext::value_mask_ + class Iterator { + public: + // -- iterator traits + using self_type = Iterator; + using value_type = KeyContext; + using reference = KeyContext&; + using pointer = KeyContext*; + using difference_type = int; + using iterator_category = std::forward_iterator_tag; + + Iterator(const Range* range, size_t idx) + : range_(range), ctx_(range->ctx_), index_(idx) { + while (index_ < range_->end_ && + (Mask{1} << index_) & + (range_->ctx_->value_mask_ | range_->skip_mask_ | + range_->invalid_mask_)) + index_++; + } + + Iterator(const Iterator&) = default; + + Iterator(const Iterator& other, const Range* range) + : range_(range), ctx_(other.ctx_), index_(other.index_) { + assert(range->ctx_ == other.ctx_); + } + Iterator& operator=(const Iterator&) = default; + + Iterator& operator++() { + while (++index_ < range_->end_ && + (Mask{1} << index_) & + (range_->ctx_->value_mask_ | range_->skip_mask_ | + range_->invalid_mask_)) + ; + return *this; + } + + bool operator==(Iterator other) const { + assert(range_->ctx_ == other.range_->ctx_); + return index_ == other.index_; + } + + bool operator!=(Iterator other) const { + assert(range_->ctx_ == other.range_->ctx_); + return index_ != other.index_; + } + + KeyContext& operator*() { + assert(index_ < range_->end_ && index_ >= range_->start_); + return *(ctx_->sorted_keys_[index_]); + } + + KeyContext* operator->() { + assert(index_ < range_->end_ && index_ >= range_->start_); + return ctx_->sorted_keys_[index_]; + } + + size_t index() { return index_; } + + private: + friend Range; + const Range* range_; + const MultiGetContext* ctx_; + size_t index_; + }; + + Range(const Range& mget_range, const Iterator& first, + const Iterator& last) { + ctx_ = mget_range.ctx_; + if (first == last) { + // This means create an empty range based on mget_range. 
So just + // set start_ and and_ to the same value + start_ = mget_range.start_; + end_ = start_; + } else { + start_ = first.index_; + end_ = last.index_; + } + skip_mask_ = mget_range.skip_mask_; + invalid_mask_ = mget_range.invalid_mask_; + assert(start_ < 64); + assert(end_ < 64); + } + + Range() = default; + + Iterator begin() const { return Iterator(this, start_); } + + Iterator end() const { return Iterator(this, end_); } + + bool empty() const { return RemainingMask() == 0; } + + void SkipIndex(size_t index) { skip_mask_ |= Mask{1} << index; } + + void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); } + + bool IsKeySkipped(const Iterator& iter) const { + return skip_mask_ & (Mask{1} << iter.index_); + } + + // Update the value_mask_ in MultiGetContext so its + // immediately reflected in all the Range Iterators + void MarkKeyDone(Iterator& iter) { + ctx_->value_mask_ |= (Mask{1} << iter.index_); + } + + bool CheckKeyDone(Iterator& iter) const { + return ctx_->value_mask_ & (Mask{1} << iter.index_); + } + + uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); } + + void AddSkipsFrom(const Range& other) { + assert(ctx_ == other.ctx_); + skip_mask_ |= other.skip_mask_; + } + + uint64_t GetValueSize() { return ctx_->value_size_; } + + void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; } + + MultiGetContext* context() const { return ctx_; } + + Range Suffix(const Range& other) const { + size_t other_last = other.FindLastRemaining(); + size_t my_last = FindLastRemaining(); + + if (my_last > other_last) { + return Range(*this, Iterator(this, other_last), + Iterator(this, my_last)); + } else { + return Range(*this, begin(), begin()); + } + } + + // The += operator expands the number of keys in this range. The expansion + // is always to the right, i.e start of the additional range >= end of + // current range. There should be no overlap. Any skipped keys in rhs are + // marked as invalid in the invalid_mask_. + Range& operator+=(const Range& rhs) { + assert(rhs.start_ >= end_); + // Check for non-overlapping ranges and adjust invalid_mask_ accordingly + if (end_ < rhs.start_) { + invalid_mask_ |= RangeMask(end_, rhs.start_); + skip_mask_ |= RangeMask(end_, rhs.start_); + } + start_ = std::min<size_t>(start_, rhs.start_); + end_ = std::max<size_t>(end_, rhs.end_); + skip_mask_ |= rhs.skip_mask_ & RangeMask(rhs.start_, rhs.end_); + invalid_mask_ |= (rhs.invalid_mask_ | rhs.skip_mask_) & + RangeMask(rhs.start_, rhs.end_); + assert(start_ < 64); + assert(end_ < 64); + return *this; + } + + // The -= operator removes keys from this range. The removed keys should + // come from a range completely overlapping the current range. The removed + // keys are marked invalid in the invalid_mask_. 
+ Range& operator-=(const Range& rhs) { + assert(start_ <= rhs.start_ && end_ >= rhs.end_); + skip_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) & + RangeMask(rhs.start_, rhs.end_); + invalid_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) & + RangeMask(rhs.start_, rhs.end_); + return *this; + } + + // Return a complement of the current range + Range operator~() { + Range res = *this; + res.skip_mask_ = ~skip_mask_ & RangeMask(start_, end_); + return res; + } + + private: + friend MultiGetContext; + MultiGetContext* ctx_; + size_t start_; + size_t end_; + Mask skip_mask_; + Mask invalid_mask_; + + Range(MultiGetContext* ctx, size_t num_keys) + : ctx_(ctx), + start_(0), + end_(num_keys), + skip_mask_(0), + invalid_mask_(0) { + assert(num_keys < 64); + } + + static Mask RangeMask(size_t start, size_t end) { + return (((Mask{1} << (end - start)) - 1) << start); + } + + Mask RemainingMask() const { + return (((Mask{1} << end_) - 1) & ~((Mask{1} << start_) - 1) & + ~(ctx_->value_mask_ | skip_mask_)); + } + + size_t FindLastRemaining() const { + Mask mask = RemainingMask(); + size_t index = (mask >>= start_) ? start_ : 0; + while (mask >>= 1) { + index++; + } + return index; + } + }; + + // Return the initial range that encompasses all the keys in the batch + Range GetMultiGetRange() { return Range(this, num_keys_); } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/persistent_cache_helper.cc b/src/rocksdb/table/persistent_cache_helper.cc new file mode 100644 index 000000000..eece8100e --- /dev/null +++ b/src/rocksdb/table/persistent_cache_helper.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
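Before the persistent cache helper implementation, an editorial sketch of the skip-and-mark pattern the Range class above is designed for. The callable parameters stand in for a per-file filter check and a point lookup; they are not RocksDB APIs.

#include <functional>

#include "table/multiget_context.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical helper: walk a per-file sub-range, skipping keys the file's
// filter rules out and marking keys whose final value has been found.
void ProcessFileRange(MultiGetContext::Range* file_range,
                      const std::function<bool(const Slice&)>& may_match,
                      const std::function<bool(KeyContext&)>& lookup) {
  for (auto iter = file_range->begin(); iter != file_range->end(); ++iter) {
    if (!may_match(iter->ukey_without_ts)) {
      // Skips affect only this Range (and copies made from it).
      file_range->SkipKey(iter);
      continue;
    }
    if (lookup(*iter)) {
      // MarkKeyDone() updates the shared value_mask_, so every other Range
      // over the same MultiGetContext now sees this key as finished.
      file_range->MarkKeyDone(iter);
    }
  }
}

}  // namespace ROCKSDB_NAMESPACE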
+ +#include "table/persistent_cache_helper.h" + +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +const PersistentCacheOptions PersistentCacheOptions::kEmpty; + +void PersistentCacheHelper::InsertSerialized( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const char* data, const size_t size) { + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache->Insert(key.AsSlice(), data, size) + .PermitUncheckedError(); +} + +void PersistentCacheHelper::InsertUncompressed( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + // Precondition: + // (1) content is cacheable + // (2) content is not compressed + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache + ->Insert(key.AsSlice(), contents.data.data(), contents.data.size()) + .PermitUncheckedError(); + ; +} + +Status PersistentCacheHelper::LookupSerialized( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + std::unique_ptr<char[]>* out_data, const size_t expected_data_size) { +#ifdef NDEBUG + (void)expected_data_size; +#endif + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + size_t size; + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), out_data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // cache hit + // Block-based table is assumed + assert(expected_data_size == + handle.size() + BlockBasedTable::kBlockTrailerSize); + assert(size == expected_data_size); + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + return Status::OK(); +} + +Status PersistentCacheHelper::LookupUncompressed( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + if (!contents) { + // We shouldn't lookup in the cache. Either + // (1) Nowhere to store + return Status::NotFound(); + } + + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + std::unique_ptr<char[]> data; + size_t size; + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), &data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // please note we are potentially comparing compressed data size with + // uncompressed data size + assert(handle.size() <= size); + + // update stats + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + // construct result and return + *contents = BlockContents(std::move(data), size); + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/persistent_cache_helper.h b/src/rocksdb/table/persistent_cache_helper.h new file mode 100644 index 000000000..ece339aee --- /dev/null +++ b/src/rocksdb/table/persistent_cache_helper.h @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <string> + +#include "monitoring/statistics.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" + +namespace ROCKSDB_NAMESPACE { + +struct BlockContents; + +// PersistentCacheHelper +// +// Encapsulates some of the helper logic for read and writing from the cache +class PersistentCacheHelper { + public: + // Insert block into cache of serialized blocks. Size includes block trailer + // (if applicable). + static void InsertSerialized(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, const char* data, + const size_t size); + + // Insert block into cache of uncompressed blocks. No block trailer. + static void InsertUncompressed(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + const BlockContents& contents); + + // Lookup block from cache of serialized blocks. Size includes block trailer + // (if applicable). + static Status LookupSerialized(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + std::unique_ptr<char[]>* out_data, + const size_t expected_data_size); + + // Lookup block from uncompressed cache. No block trailer. + static Status LookupUncompressed(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + BlockContents* contents); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/persistent_cache_options.h b/src/rocksdb/table/persistent_cache_options.h new file mode 100644 index 000000000..b543ab3a3 --- /dev/null +++ b/src/rocksdb/table/persistent_cache_options.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <string> + +#include "cache/cache_key.h" +#include "monitoring/statistics.h" +#include "rocksdb/persistent_cache.h" + +namespace ROCKSDB_NAMESPACE { + +// PersistentCacheOptions +// +// This describe the caching behavior for page cache +// This is used to pass the context for caching and the cache handle +struct PersistentCacheOptions { + PersistentCacheOptions() {} + explicit PersistentCacheOptions( + const std::shared_ptr<PersistentCache>& _persistent_cache, + const OffsetableCacheKey& _base_cache_key, Statistics* const _statistics) + : persistent_cache(_persistent_cache), + base_cache_key(_base_cache_key), + statistics(_statistics) {} + std::shared_ptr<PersistentCache> persistent_cache; + OffsetableCacheKey base_cache_key; + Statistics* statistics = nullptr; + + static const PersistentCacheOptions kEmpty; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc new file mode 100644 index 000000000..21441f616 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
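Editor's note: PersistentCacheHelper above only drives the PersistentCache interface it is given: Insert(key, data, size) stores a copy, and Lookup(key, &data, &size) either fills a heap buffer and returns OK (counted as a hit) or returns a non-OK status (counted as a miss). A toy in-memory stand-in for that contract, purely to illustrate the Insert/Lookup shape the helper relies on; this is not RocksDB's PersistentCache and the ToyStatus/ToyPersistentCache names are invented for the sketch:

#include <cstring>
#include <map>
#include <memory>
#include <string>

// Minimal status type so the sketch stays self-contained.
struct ToyStatus {
  bool ok_;
  bool ok() const { return ok_; }
  static ToyStatus OK() { return {true}; }
  static ToyStatus NotFound() { return {false}; }
};

// Toy key/value store with the same Insert/Lookup shape used above.
class ToyPersistentCache {
 public:
  ToyStatus Insert(const std::string& key, const char* data, size_t size) {
    store_[key].assign(data, size);
    return ToyStatus::OK();
  }
  ToyStatus Lookup(const std::string& key, std::unique_ptr<char[]>* out,
                   size_t* size) {
    auto it = store_.find(key);
    if (it == store_.end()) {
      return ToyStatus::NotFound();  // caller records a cache miss
    }
    *size = it->second.size();
    out->reset(new char[*size]);
    std::memcpy(out->get(), it->second.data(), *size);
    return ToyStatus::OK();  // caller records a cache hit
  }

 private:
  std::map<std::string, std::string> store_;
};

int main() {
  ToyPersistentCache cache;
  cache.Insert("block-0042", "payload", 7);
  std::unique_ptr<char[]> data;
  size_t size = 0;
  if (cache.Lookup("block-0042", &data, &size).ok()) {
    // Hit: data holds a private copy of the 7-byte payload.
  }
  return 0;
}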
+ +#include "table/plain/plain_table_bloom.h" + +#include <algorithm> +#include <string> + +#include "memory/allocator.h" +#include "util/dynamic_bloom.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} // namespace + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = raw_data; + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = raw; +} + +void BloomBlockBuilder::AddKeysHashes( + const std::vector<uint32_t>& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h new file mode 100644 index 000000000..460e7ec39 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_bloom.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/bloom_impl.h" +#include "util/hash.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { +class Slice; +class Allocator; +class Logger; + +// A legacy Bloom filter implementation used by Plain Table db format, for +// schema backward compatibility. Not for use in new filter applications. +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. 
Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); } + + void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + char* data_; + + static constexpr int LOG2_CACHE_LINE_SIZE = + ConstexprFloorLog2(CACHE_LINE_SIZE); +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t ignored; + LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch( + h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + if (kNumBlocks != 0) { + return LegacyLocalityBloomImpl<true>::HashMayMatch( + h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE); + } else { + return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes, + data_); + } +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + if (kNumBlocks != 0) { + LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_, + LOG2_CACHE_LINE_SIZE); + } else { + LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_); + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector<uint32_t>& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc new file mode 100644 index 000000000..04723955c --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_builder.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
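Editor's note: the bloom code above sizes the filter in whole cache-line blocks (with an odd block count) and, in locality mode, keeps every probe for a key inside one block so a lookup touches a single cache line. The sketch below illustrates that idea only; the bit mixing is generic, not the LegacyLocalityBloomImpl schema, so it is not bit-compatible with PlainTableBloomV1, and it assumes a 64-byte cache line:

#include <cstdint>
#include <vector>

// Simplified cache-line-local bloom filter: all probes for a key land in one
// 64-byte block, mirroring the sizing logic above (whole cache lines, odd
// block count). Not bit-compatible with the legacy RocksDB implementation.
class BlockLocalBloom {
 public:
  BlockLocalBloom(uint32_t total_bits, uint32_t num_probes)
      : num_probes_(num_probes) {
    const uint32_t kBitsPerBlock = 64 * 8;  // one assumed 64-byte cache line
    uint32_t num_blocks = (total_bits + kBitsPerBlock - 1) / kBitsPerBlock;
    if (num_blocks % 2 == 0) {
      num_blocks++;  // odd block count spreads hashes more evenly
    }
    num_blocks_ = num_blocks;
    bits_.assign(num_blocks_ * 64, 0);  // 64 bytes per block
  }

  void AddHash(uint32_t h) {
    uint8_t* block = &bits_[(h % num_blocks_) * 64];
    for (uint32_t i = 0; i < num_probes_; i++) {
      h = h * 0x9e3779b9u + i;      // re-mix for the next probe
      uint32_t bit = h % (64 * 8);  // bit offset inside the block
      block[bit / 8] |= static_cast<uint8_t>(1u << (bit % 8));
    }
  }

  bool MayContainHash(uint32_t h) const {
    const uint8_t* block = &bits_[(h % num_blocks_) * 64];
    for (uint32_t i = 0; i < num_probes_; i++) {
      h = h * 0x9e3779b9u + i;
      uint32_t bit = h % (64 * 8);
      if ((block[bit / 8] & (1u << (bit % 8))) == 0) {
        return false;  // definitely absent
      }
    }
    return true;  // possibly present
  }

 private:
  uint32_t num_probes_;
  uint32_t num_blocks_;
  std::vector<uint8_t> bits_;
};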
+ +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_builder.h" + +#include <assert.h> + +#include <limits> +#include <map> +#include <string> + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. +IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + IOStatus io_s = file->Append(block_contents); + + if (io_s.ok()) { + *offset += block_contents.size(); + } + return io_s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.table.plain | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; +extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder( + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, + bool store_index_in_file, const std::string& db_id, + const std::string& db_session_id, uint64_t file_number) + : ioptions_(ioptions), + moptions_(moptions), + bloom_block_(num_probes), + file_(file), + bloom_bits_per_key_(bloom_bits_per_key), + huge_page_tlb_size_(huge_page_tlb_size), + encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(), + index_sparseness), + store_index_in_file_(store_index_in_file), + prefix_extractor_(moptions.prefix_extractor.get()) { + // Build index block and save it in the file if hash_table_ratio > 0 + if (store_index_in_file_) { + assert(hash_table_ratio > 0 || IsTotalOrderMode()); + index_builder_.reset(new PlainTableIndexBuilder( + &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); + properties_ + .user_collected_properties[PlainTablePropertyNames::kBloomVersion] = + "1"; // For future use + } + + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // Fill it later if store_index_in_file_ == true + properties_.index_size = 0; + properties_.filter_size = 0; + // To support roll-back to previous version, now still use version 0 for + // plain encoding. + properties_.format_version = (encoding_type == kPlain) ? 
0 : 1; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } + properties_.orig_file_number = file_number; + properties_.prefix_extractor_name = + moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->AsString() + : "nullptr"; + + std::string val; + PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType())); + properties_ + .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val; + + assert(int_tbl_prop_collector_factories); + for (auto& factory : *int_tbl_prop_collector_factories) { + assert(factory); + + table_properties_collectors_.emplace_back( + factory->CreateIntTblPropCollector(column_family_id, + level_at_creation)); + } +} + +PlainTableBuilder::~PlainTableBuilder() { + // They are supposed to have been passed to users through Finish() + // if the file succeeds. + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + // temp buffer for metadata bytes between key and value. + char meta_bytes_buf[6]; + size_t meta_bytes_buf_size = 0; + + ParsedInternalKey internal_key; + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO + assert(false); + return; + } + if (internal_key.type == kTypeRangeDeletion) { + status_ = Status::NotSupported("Range deletion unsupported"); + return; + } + + // Store key hash + if (store_index_in_file_) { + if (moptions_.prefix_extractor == nullptr) { + keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); + } else { + Slice prefix = + moptions_.prefix_extractor->Transform(internal_key.user_key); + keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); + } + } + + // Write value + assert(offset_ <= std::numeric_limits<uint32_t>::max()); + auto prev_offset = static_cast<uint32_t>(offset_); + // Write out the key + io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); + if (SaveIndexInFile()) { + index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); + } + + // Write value length + uint32_t value_size = static_cast<uint32_t>(value.size()); + if (io_status_.ok()) { + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + } + + // Write value + if (io_status_.ok()) { + io_status_ = file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + } + + if (io_status_.ok()) { + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } + } + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, value, offset_, table_properties_collectors_, ioptions_.logger); + status_ = io_status_; +} + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // 
Write the following blocks + // 1. [meta block: bloom] - optional + // 2. [meta block: index] - optional + // 3. [meta block: properties] + // 4. [metaindex block] + // 5. [footer] + + MetaIndexBuilder meta_index_builer; + + if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max()); + BlockHandle bloom_block_handle; + if (bloom_bits_per_key_ > 0) { + bloom_block_.SetTotalBits( + &arena_, + static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); + + PutVarint32(&properties_.user_collected_properties + [PlainTablePropertyNames::kNumBloomBlocks], + bloom_block_.GetNumBlocks()); + + bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_); + + Slice bloom_finish_result = bloom_block_.Finish(); + + properties_.filter_size = bloom_finish_result.size(); + io_status_ = + WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); + } + BlockHandle index_block_handle; + Slice index_finish_result = index_builder_->Finish(); + + properties_.index_size = index_finish_result.size(); + io_status_ = + WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, + index_block_handle); + } + + // Calculate bloom block size and index block size + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + property_block_builder.Add(properties_.user_collected_properties); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); + + // -- Write property block + BlockHandle property_block_handle; + IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_, + &property_block_handle); + if (!s.ok()) { + return static_cast<Status>(s); + } + meta_index_builer.Add(kPropertiesBlockName, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_, + &metaindex_block_handle); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; + } + + // Write Footer + // no need to write out new footer if we're using default checksum + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_, + kNoChecksum, metaindex_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + if (io_status_.ok()) { + offset_ += footer.GetSlice().size(); + } + status_ = io_status_; + return status_; +} + +void PlainTableBuilder::Abandon() { closed_ = true; } + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { return offset_; } + +std::string PlainTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + +const char* PlainTableBuilder::GetFileChecksumFuncName() const { + if (file_ != nullptr) { + return file_->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName; + } +} +void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& 
string, + uint64_t uint_64) { + // TODO: storing seqno to time mapping is not yet support for plain table. + TableBuilder::SetSeqnoTimeTableProperties(string, uint_64); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h new file mode 100644 index 000000000..445491c2a --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_builder.h @@ -0,0 +1,154 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE +#include <stdint.h> + +#include <string> +#include <vector> + +#include "db/version_edit.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/table_builder.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +// The builder class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder( + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, + WritableFileWriter* file, uint32_t user_key_size, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + double hash_table_ratio = 0, bool store_index_in_file = false, + const std::string& db_id = "", const std::string& db_session_id = "", + uint64_t file_number = 0); + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override { return status_; } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. 
+ // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + bool SaveIndexInFile() const { return store_index_in_file_; } + + // Get file checksum + std::string GetFileChecksum() const override; + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + void SetSeqnoTimeTableProperties(const std::string& string, + uint64_t uint_64) override; + + private: + Arena arena_; + const ImmutableOptions& ioptions_; + const MutableCFOptions& moptions_; + std::vector<std::unique_ptr<IntTblPropCollector>> + table_properties_collectors_; + + BloomBlockBuilder bloom_block_; + std::unique_ptr<PlainTableIndexBuilder> index_builder_; + + WritableFileWriter* file_; + uint64_t offset_ = 0; + uint32_t bloom_bits_per_key_; + size_t huge_page_tlb_size_; + Status status_; + IOStatus io_status_; + TableProperties properties_; + PlainTableKeyEncoder encoder_; + + bool store_index_in_file_; + + std::vector<uint32_t> keys_or_prefixes_hashes_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + const SliceTransform* prefix_extractor_; + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(ExtractUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc new file mode 100644 index 000000000..dfe5241a5 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_factory.cc @@ -0,0 +1,350 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
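Editor's note: PlainTableBuilder::Add() above frames each row as the encoded key, a varint32 value size, and the raw value bytes, advancing one running offset. A simplified sketch of that row framing; it writes the key verbatim rather than going through PlainTableKeyEncoder (kPlain/kPrefix), and the helper names are illustrative:

#include <cstdint>
#include <string>

// Minimal varint32 encoder (same little-endian base-128 wire format as
// RocksDB's EncodeVarint32).
static void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Append one PlainTable-style row: [key bytes][varint32 value size][value].
// Real rows encode an internal key via PlainTableKeyEncoder; this sketch only
// shows the framing. Returns the new running offset (the file size so far).
static uint64_t AppendRow(std::string* file, const std::string& key,
                          const std::string& value) {
  file->append(key);                                       // encoded key
  PutVarint32(file, static_cast<uint32_t>(value.size()));  // value length
  file->append(value);                                     // value bytes
  return file->size();
}

int main() {
  std::string file;
  AppendRow(&file, "key1", "value1");
  uint64_t data_size = AppendRow(&file, "key2", "value2");
  // data_size corresponds to what the builder records as data_size.
  return data_size > 0 ? 0 : 1;
}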
+ +#include "table/plain/plain_table_factory.h" + +#include <stdint.h> + +#include <memory> + +#include "db/dbformat.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) + : table_options_(options) { + RegisterOptions(&table_options_, &plain_table_type_info); +} + +Status PlainTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + return PlainTableReader::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_reader_options.internal_comparator, std::move(file), file_size, + table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, + table_options_.index_sparseness, table_options_.huge_page_tlb_size, + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor.get()); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const { + // Ignore the skip_filters flag. PlainTable format is optimized for small + // in-memory dbs. 
The skip_filters optimization is not useful for plain + // tables + // + return new PlainTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, + table_builder_options.level_at_creation, file, + table_options_.user_key_len, table_options_.encoding_type, + table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.column_family_name, 6, + table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, + table_options_.store_index_in_file, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); +} + +std::string PlainTableFactory::GetPrintableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " user_key_len: %u\n", + table_options_.user_key_len); + ret.append(buffer); + snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", + table_options_.bloom_bits_per_key); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n", + table_options_.index_sparseness); + ret.append(buffer); + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n", + table_options_.huge_page_tlb_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " encoding_type: %d\n", + table_options_.encoding_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", + table_options_.full_scan_mode); + ret.append(buffer); + snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", + table_options_.store_index_in_file); + ret.append(buffer); + return ret; +} + +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + return GetPlainTableOptionsFromString(config_options, table_options, opts_str, + new_table_options); +} + +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, + const std::string& /*arg*/) { + // The MemTableRepFactory built-in classes will be either a class + // (VectorRepFactory) or a nickname (vector), followed optionally by ":#", + // where # is the "size" of the factory. 
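Editor's note: the registry set up above maps class names and nicknames such as skip_list, vector, prefix_hash and hash_linkedlist, optionally followed by ":<number>", to memtable factory constructors. A hedged usage sketch of the string form; the entry points are the ones defined in this file, but the header paths and the ColumnFamilyOptions wiring mentioned in the comment are assumptions:

#include <memory>

#include "rocksdb/convenience.h"  // ConfigOptions (assumed header)
#include "rocksdb/memtablerep.h"  // MemTableRepFactory (assumed header)
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  // "skip_list:16" resolves to a SkipListFactory with a lookahead of 16,
  // per the PatternEntry registration above. Nicknames work the same way,
  // e.g. "vector:100" or "prefix_hash:4096".
  ConfigOptions config_options;
  std::shared_ptr<MemTableRepFactory> factory;
  Status s = MemTableRepFactory::CreateFromString(config_options,
                                                  "skip_list:16", &factory);
  if (s.ok()) {
    // The factory could then be assigned to ColumnFamilyOptions::memtable_factory.
  }
  return s.ok() ? 0 : 1;
}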
+ auto AsPattern = [](const std::string& name, const std::string& alt) { + auto pattern = ObjectLibrary::PatternEntry(name, true); + pattern.AnotherName(alt); + pattern.AddNumber(":"); + return pattern; + }; + library.AddFactory<MemTableRepFactory>( + AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()), + [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new VectorRepFactory(count)); + } else { + guard->reset(new VectorRepFactory()); + } + return guard->get(); + }); + library.AddFactory<MemTableRepFactory>( + AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()), + [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t lookahead = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new SkipListFactory(lookahead)); + } else { + guard->reset(new SkipListFactory()); + } + return guard->get(); + }); + library.AddFactory<MemTableRepFactory>( + AsPattern("HashLinkListRepFactory", "hash_linkedlist"), + [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard, + std::string* /*errmsg*/) { + // Expecting format: hash_linkedlist:<hash_bucket_count> + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashLinkListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashLinkListRepFactory()); + } + return guard->get(); + }); + library.AddFactory<MemTableRepFactory>( + AsPattern("HashSkipListRepFactory", "prefix_hash"), + [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard, + std::string* /*errmsg*/) { + // Expecting format: prefix_hash:<hash_bucket_count> + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashSkipListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashSkipListRepFactory()); + } + return guard->get(); + }); + library.AddFactory<MemTableRepFactory>( + "cuckoo", + [](const std::string& /*uri*/, + std::unique_ptr<MemTableRepFactory>* /*guard*/, std::string* errmsg) { + *errmsg = "cuckoo hash memtable is not supported anymore."; + return nullptr; + }); + + size_t num_types; + return static_cast<int>(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, std::unique_ptr<MemTableRepFactory>* result) { + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + return MemTableRepFactory::CreateFromString(config_options, opts_str, result); +} + +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr<MemTableRepFactory>* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + std::string id; + std::unordered_map<std::string, std::string> opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { + // No Id and 
no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = NewUniqueObject<MemTableRepFactory>(config_options, id, opt_map, + result); +#else + // To make it possible to configure the memtables in LITE mode, the ID + // is of the form <name>:<size>, where name is the name of the class and + // <size> is the length of the object (e.g. skip_list:10). + std::vector<std::string> opts_list = StringSplit(id, ':'); + if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) { + status = Status::InvalidArgument("Can't parse memtable_factory option ", + value); + } else if (opts_list[0] == SkipListFactory::kNickName() || + opts_list[0] == SkipListFactory::kClassName()) { + // Expecting format + // skip_list:<lookahead> + if (opts_list.size() == 2) { + size_t lookahead = ParseSizeT(opts_list[1]); + result->reset(new SkipListFactory(lookahead)); + } else { + result->reset(new SkipListFactory()); + } + } else if (!config_options.ignore_unsupported_options) { + status = Status::NotSupported("Cannot load object in LITE mode ", id); + } +#endif // ROCKSDB_LITE + } + return status; +} + +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr<MemTableRepFactory>* result) { + std::unique_ptr<MemTableRepFactory> factory; + Status s = CreateFromString(config_options, value, &factory); + if (factory && s.ok()) { + result->reset(factory.release()); + } + return s; +} + +#ifndef ROCKSDB_LITE +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + return GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); +} + +Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + PlainTableOptions* new_table_options) { + assert(new_table_options); + PlainTableFactory ptf(table_options); + Status s = ptf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(ptf.GetOptions<PlainTableOptions>()); + } else { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + } + return s; +} + +extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { + return new PlainTableFactory(options); +} + +const std::string PlainTablePropertyNames::kEncodingType = + "rocksdb.plain.table.encoding.type"; + +const std::string PlainTablePropertyNames::kBloomVersion = + "rocksdb.plain.table.bloom.version"; + +const std::string PlainTablePropertyNames::kNumBloomBlocks = + "rocksdb.plain.table.bloom.numblocks"; + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h new file mode 100644 index 000000000..ce60b9d19 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_factory.h @@ -0,0 +1,182 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE +#include <stdint.h> + +#include <memory> +#include <string> + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// PlainTableFactory is the entrance function to the PlainTable format of +// SST files. It returns instances PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of +// following downsides +// 1. Data compression is not supported. +// 2. Data is not checksumed. +// it is not recommended to use this format on other type of file systems. +// +// PlainTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------+-----------------+ <= key1 offset +// | encoded key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +--------------------------+-------------+---+ <= key2 offset +// | encoded key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// +// When the key encoding type is kPlain. Key part is encoded as: +// +------------+--------------------+ +// | [key_size] | internal key | +// +------------+--------------------+ +// for the case of user_key_len = kPlainTableVariableLength case, +// and simply: +// +----------------------+ +// | internal key | +// +----------------------+ +// for user_key_len != kPlainTableVariableLength case. +// +// If key encoding type is kPrefix. Keys are encoding in this format. +// There are three ways to encode a key: +// (1) Full Key +// +---------------+---------------+-------------------+ +// | Full Key Flag | Full Key Size | Full Internal Key | +// +---------------+---------------+-------------------+ +// which simply encodes a full key +// +// (2) A key shared the same prefix as the previous key, which is encoded as +// format of (1). +// +-------------+-------------+-------------+-------------+------------+ +// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | +// +-------------+-------------+-------------+-------------+------------+ +// where key is the suffix part of the key, including the internal bytes. +// the actual key will be constructed by concatenating prefix part of the +// previous key, with the suffix part of the key here, with sizes given here. +// +// (3) A key shared the same prefix as the previous key, which is encoded as +// the format of (2). +// +-----------------+-----------------+------------------------+ +// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | +// +-----------------+-----------------+------------------------+ +// The key will be constructed by concatenating previous key's prefix (which is +// also a prefix which the last key encoded in the format of (1)) and the +// key given here. 
+// +// For example, we for following keys (prefix and suffix are separated by +// spaces): +// 0000 0001 +// 0000 00021 +// 0000 0002 +// 00011 00 +// 0002 0001 +// Will be encoded like this: +// FK 8 00000001 +// PF 4 SF 5 00021 +// SF 4 0002 +// FK 7 0001100 +// FK 8 00020001 +// (where FK means full key flag, PF means prefix flag and SF means suffix flag) +// +// All those "key flag + key size" shown above are in this format: +// The 8 bits of the first byte: +// +----+----+----+----+----+----+----+----+ +// | Type | Size | +// +----+----+----+----+----+----+----+----+ +// Type indicates: full key, prefix, or suffix. +// The last 6 bits are for size. If the size bits are not all 1, it means the +// size of the key. Otherwise, varint32 is read after this byte. This varint +// value + 0x3F (the value of all 1) will be the key size. +// +// For example, full key with length 16 will be encoded as (binary): +// 00 010000 +// (00 means full key) +// and a prefix with 100 bytes will be encoded as: +// 01 111111 00100101 +// (63) (37) +// (01 means key suffix) +// +// All the internal keys above (including kPlain and kPrefix) are encoded in +// this format: +// There are two types: +// (1) normal internal key format +// +----------- ...... -------------+----+---+---+---+---+---+---+---+ +// | user key |type| sequence ID | +// +----------- ..... --------------+----+---+---+---+---+---+---+---+ +// (2) Special case for keys whose sequence ID is 0 and is value type +// +----------- ...... -------------+----+ +// | user key |0x80| +// +----------- ..... --------------+----+ +// To save 7 bytes for the special case where sequence ID = 0. +// +// +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_len is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is + // number of bits used for bloom filer per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skip hash table but only replying on binary + // search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear search + // required after hash and binary search. + // index_sparseness = 0 means index for every key. + // huge_page_tlb_size determines whether to allocate hash indexes from huge + // page TLB and the page size if allocating from there. See comments of + // Arena::AllocateAligned() for details. 
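Editor's note: a hedged usage sketch of the factory parameters documented above, assuming the public PlainTableOptions struct and NewPlainTableFactory() declared in rocksdb/table.h (field names match the option map in plain_table_factory.cc); the chosen values are illustrative:

#include "rocksdb/options.h"  // Options (assumed header)
#include "rocksdb/table.h"    // PlainTableOptions, NewPlainTableFactory

using namespace ROCKSDB_NAMESPACE;

int main() {
  PlainTableOptions opts;
  opts.user_key_len = 16;        // fixed-length 16-byte user keys
  opts.bloom_bits_per_key = 10;  // bloom sizing per key
  opts.hash_table_ratio = 0.75;  // prefixes / hash buckets
  opts.index_sparseness = 16;    // index every 16th key within a prefix
  opts.huge_page_tlb_size = 0;   // do not allocate the index from huge pages

  Options options;
  options.table_factory.reset(NewPlainTableFactory(opts));
  // A prefix_extractor must also be configured for prefix hashing to apply.
  return 0;
}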
+ explicit PlainTableFactory( + const PlainTableOptions& _table_options = PlainTableOptions()); + + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kPlainTableName(); } + const char* Name() const override { return kPlainTableName(); } + using TableFactory::NewTableReader; + Status NewTableReader(const ReadOptions& ro, + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file) const override; + + std::string GetPrintableOptions() const override; + static const char kValueTypeSeqId0 = char(~0); + + private: + PlainTableOptions table_options_; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc new file mode 100644 index 000000000..b7e07cfb2 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_index.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_index.h" + +#include <cinttypes> + +#include "logging/logging.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + assert(num_buckets > 0); + return hash % num_buckets; +} +} // namespace + +Status PlainTableIndex::InitFromRawData(Slice data) { + if (!GetVarint32(&data, &index_size_)) { + return Status::Corruption("Couldn't read the index size!"); + } + assert(index_size_ > 0); + if (!GetVarint32(&data, &num_prefixes_)) { + return Status::Corruption("Couldn't read the index size!"); + } + sub_index_size_ = + static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen; + + char* index_data_begin = const_cast<char*>(data.data()); + index_ = reinterpret_cast<uint32_t*>(index_data_begin); + sub_index_ = reinterpret_cast<char*>(index_ + index_size_); + return Status::OK(); +} + +PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( + uint32_t prefix_hash, uint32_t* bucket_value) const { + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + GetUnaligned(index_ + bucket, bucket_value); + if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { + *bucket_value ^= kSubIndexMask; + return kSubindex; + } + if (*bucket_value >= kMaxFileSize) { + return kNoPrefixForBucket; + } else { + // point directly to the file + return kDirectToFile; + } +} + +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, + uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; +} + +void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, + uint32_t key_offset) { + if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { + ++num_prefixes_; + if (!is_first_record_) { + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + 
} + num_keys_per_prefix_ = 0; + prev_key_prefix_ = key_prefix_slice.ToString(); + prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); + due_index_ = true; + } + + if (due_index_) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list_.AddRecord(prev_key_prefix_hash_, key_offset); + due_index_ = false; + } + + num_keys_per_prefix_++; + if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { + due_index_ = true; + } + is_first_record_ = false; +} + +Slice PlainTableIndexBuilder::Finish() { + AllocateIndex(); + std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr); + std::vector<uint32_t> entries_per_bucket(index_size_, 0); + BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); + + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist_.ToString().c_str()); + + // From the temp data structure, populate indexes. + return FillIndexes(hash_to_offsets, entries_per_bucket); +} + +void PlainTableIndexBuilder::AllocateIndex() { + if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. + index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / hash_table_ratio_; + index_size_ = + static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1; + assert(index_size_ > 0); + } +} + +void PlainTableIndexBuilder::BucketizeIndexes( + std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket) { + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list_.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list_.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + (*entries_per_bucket)[bucket]++; + } + + sub_index_size_ = 0; + for (auto entry_count : *entries_per_bucket) { + if (entry_count <= 1) { + continue; + } + // Only buckets with more than 1 entry will have subindex. + sub_index_size_ += VarintLength(entry_count); + // total bytes needed to store these entries' in-file offsets. 
+ sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; + } +} + +Slice PlainTableIndexBuilder::FillIndexes( + const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket) { + ROCKS_LOG_DEBUG(ioptions_.logger, + "Reserving %" PRIu32 " bytes for plain table's sub_index", + sub_index_size_); + auto total_allocate_size = GetTotalSize(); + char* allocated = arena_->AllocateAligned( + total_allocate_size, huge_page_tlb_size_, ioptions_.logger); + + auto temp_ptr = EncodeVarint32(allocated, index_size_); + uint32_t* index = + reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_)); + char* sub_index = reinterpret_cast<char*>(index + index_size_); + + uint32_t sub_index_offset = 0; + for (uint32_t i = 0; i < index_size_; i++) { + uint32_t num_keys_for_bucket = entries_per_bucket[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize); + break; + case 1: + // point directly to the file offset + PutUnaligned(index + i, hash_to_offsets[i]->offset); + break; + default: + // point to second level indexes. + PutUnaligned(index + i, + sub_index_offset | PlainTableIndex::kSubIndexMask); + char* prev_ptr = &sub_index[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr); + char* sub_index_pos = &sub_index[sub_index_offset]; + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); + } + assert(j == -1 && record == nullptr); + sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; + assert(sub_index_offset <= sub_index_size_); + break; + } + } + assert(sub_index_offset == sub_index_size_); + + ROCKS_LOG_DEBUG(ioptions_.logger, + "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, + index_size_, sub_index_size_); + return Slice(allocated, GetTotalSize()); +} + +const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = + "PlainTableIndexBlock"; +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h new file mode 100644 index 000000000..9f5f0eeff --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_index.h @@ -0,0 +1,248 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "memory/arena.h" +#include "monitoring/histogram.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// The file contains two classes PlainTableIndex and PlainTableIndexBuilder +// The two classes implement the index format of PlainTable. +// For description of PlainTable format, see comments of class +// PlainTableFactory +// +// +// PlainTableIndex contains buckets size of index_size_, each is a +// 32-bit integer. The lower 31 bits contain an offset value (explained below) +// and the first bit of the integer indicates type of the offset. 
+// +// +--------------+------------------------------------------------------+ +// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + +// +--------------+------------------------------------------------------+ +// +// Explanation for the "flag bit": +// +// 0 indicates that the bucket contains only one prefix (no conflict when +// hashing this prefix), whose first row starts from this offset of the +// file. +// 1 indicates that the bucket contains more than one prefixes, or there +// are too many rows for one prefix so we need a binary search for it. In +// this case, the offset indicates the offset of sub_index_ holding the +// binary search indexes of keys for those rows. Those binary search indexes +// are organized in this way: +// +// The first 4 bytes, indicate how many indexes (N) are stored after it. After +// it, there are N 32-bit integers, each points of an offset of the file, +// which +// points to starting of a row. Those offsets need to be guaranteed to be in +// ascending order so the keys they are pointing to are also in ascending +// order +// to make sure we can use them to do binary searches. Below is visual +// presentation of a bucket. +// +// <begin> +// number_of_records: varint32 +// record 1 file offset: fixedint32 +// record 2 file offset: fixedint32 +// .... +// record N file offset: fixedint32 +// <end> + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by PlainTableReader class. +class PlainTableIndex { + public: + enum IndexSearchResult { + kNoPrefixForBucket = 0, + kDirectToFile = 1, + kSubindex = 2 + }; + + explicit PlainTableIndex(Slice data) { InitFromRawData(data); } + + PlainTableIndex() + : index_size_(0), + sub_index_size_(0), + num_prefixes_(0), + index_(nullptr), + sub_index_(nullptr) {} + + // The function that executes the lookup the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. + IndexSearchResult GetOffset(uint32_t prefix_hash, + uint32_t* bucket_value) const; + + // Initialize data from `index_data`, which points to raw data for + // index stored in the SST file. + Status InitFromRawData(Slice index_data); + + // Decode the sub index for specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. + const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, + uint32_t* upper_bound) const { + const char* index_ptr = &sub_index_[offset]; + return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); + } + + uint32_t GetIndexSize() const { return index_size_; } + + uint32_t GetSubIndexSize() const { return sub_index_size_; } + + uint32_t GetNumPrefixes() const { return num_prefixes_; } + + static const uint64_t kMaxFileSize = (1u << 31) - 1; + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + + private: + uint32_t index_size_; + uint32_t sub_index_size_; + uint32_t num_prefixes_; + + uint32_t* index_; + char* sub_index_; +}; + +// PlainTableIndexBuilder is used to create plain table index. +// After calling Finish(), it returns Slice, which is usually +// used either to initialize PlainTableIndex or +// to save index to sst file. 
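Editor's note: each index bucket described above is a 32-bit value whose top bit selects the interpretation: a direct file offset, an offset into the sub-index (top bit set), or "no prefix" when the value is at least kMaxFileSize. A standalone decoding sketch mirroring GetOffset(); the constants are copied from PlainTableIndex above and the other names are illustrative:

#include <cstdint>

// Constants as defined by PlainTableIndex above.
static const uint32_t kSubIndexMask = 0x80000000u;
static const uint32_t kMaxFileSize = (1u << 31) - 1;

enum BucketKind { kNoPrefixForBucket, kDirectToFile, kSubindex };

// Interpret one hash-bucket entry the way GetOffset() does: top bit set means
// an offset into the sub-index; otherwise it is either a direct row offset in
// the data file or the "empty bucket" marker (kMaxFileSize).
static BucketKind DecodeBucket(uint32_t bucket_value, uint32_t* out) {
  if ((bucket_value & kSubIndexMask) == kSubIndexMask) {
    *out = bucket_value ^ kSubIndexMask;  // offset within the sub-index
    return kSubindex;
  }
  if (bucket_value >= kMaxFileSize) {
    return kNoPrefixForBucket;  // no rows hashed to this bucket; *out unused
  }
  *out = bucket_value;  // offset of the first row for this prefix
  return kDirectToFile;
}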
+// For more details about the index, please refer to: +// https://github.com/facebook/rocksdb/wiki/PlainTable-Format +// #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class. +class PlainTableIndexBuilder { + public: + PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, + const SliceTransform* prefix_extractor, + size_t index_sparseness, double hash_table_ratio, + size_t huge_page_tlb_size) + : arena_(arena), + ioptions_(ioptions), + record_list_(kRecordsPerGroup), + is_first_record_(true), + due_index_(false), + num_prefixes_(0), + num_keys_per_prefix_(0), + prev_key_prefix_hash_(0), + index_sparseness_(index_sparseness), + index_size_(0), + sub_index_size_(0), + prefix_extractor_(prefix_extractor), + hash_table_ratio_(hash_table_ratio), + huge_page_tlb_size_(huge_page_tlb_size) {} + + void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); + + Slice Finish(); + + uint32_t GetTotalSize() const { + return VarintLength(index_size_) + VarintLength(num_prefixes_) + + PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; + } + + static const std::string kPlainTableIndexBlock; + + private: + struct IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; + }; + + // Helper class to track all the index records + class IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(uint32_t hash, uint32_t offset); + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &( + groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + // Each group in `groups_` contains fix-sized records (determined by + // kNumRecordsPerGroup). Which can help us minimize the cost if resizing + // occurs. + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector<IndexRecord*> groups_; + size_t num_records_in_current_group_; + }; + + void AllocateIndex(); + + // Internal helper function to bucket index record list to hash buckets. + void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. 
+ Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket); + + Arena* arena_; + const ImmutableOptions ioptions_; + HistogramImpl keys_per_prefix_hist_; + IndexRecordList record_list_; + bool is_first_record_; + bool due_index_; + uint32_t num_prefixes_; + uint32_t num_keys_per_prefix_; + + uint32_t prev_key_prefix_hash_; + size_t index_sparseness_; + uint32_t index_size_; + uint32_t sub_index_size_; + + const SliceTransform* prefix_extractor_; + double hash_table_ratio_; + size_t huge_page_tlb_size_; + + std::string prev_key_prefix_; + + static const size_t kRecordsPerGroup = 256; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc new file mode 100644 index 000000000..800d8d76f --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_key_coding.cc @@ -0,0 +1,509 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_key_coding.h" + +#include <algorithm> +#include <string> + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +enum PlainTableEntryType : unsigned char { + kFullKey = 0, + kPrefixFromPreviousKey = 1, + kKeySuffix = 2, +}; + +namespace { + +// Control byte: +// First two bits indicate type of entry +// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes +// are used. key_size-0x3F will be encoded as a variint32 after this bytes. + +const unsigned char kSizeInlineLimit = 0x3F; + +// Return 0 for error +size_t EncodeSize(PlainTableEntryType type, uint32_t key_size, + char* out_buffer) { + out_buffer[0] = type << 6; + + if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) { + // size inlined + out_buffer[0] |= static_cast<char>(key_size); + return 1; + } else { + out_buffer[0] |= kSizeInlineLimit; + char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit); + return ptr - out_buffer; + } +} +} // namespace + +// Fill bytes_read with number of bytes read. 
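+// Worked example of the control byte (illustrative, following EncodeSize()
+// above): a kKeySuffix entry (type 2) with size 5 is encoded in one byte as
+// (2 << 6) | 5 = 0x85. A size of 0x3F or larger does not fit inline: the low
+// six bits are set to 0x3F and (size - 0x3F) follows as a varint32, so a
+// kFullKey entry of size 100 becomes the two bytes 0x3F 0x25.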
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, + uint32_t* key_size, + uint32_t* bytes_read) { + Slice next_byte_slice; + bool success = file_reader_.Read(start_offset, 1, &next_byte_slice); + if (!success) { + return file_reader_.status(); + } + *entry_type = static_cast<PlainTableEntryType>( + (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >> + 6); + char inline_key_size = next_byte_slice[0] & kSizeInlineLimit; + if (inline_key_size < kSizeInlineLimit) { + *key_size = inline_key_size; + *bytes_read = 1; + return Status::OK(); + } else { + uint32_t extra_size; + uint32_t tmp_bytes_read; + success = file_reader_.ReadVarint32(start_offset + 1, &extra_size, + &tmp_bytes_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_bytes_read > 0); + *key_size = kSizeInlineLimit + extra_size; + *bytes_read = tmp_bytes_read + 1; + return Status::OK(); + } +} + +IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size) { + ParsedInternalKey parsed_key; + Status pik_status = + ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return IOStatus::Corruption(pik_status.getState()); + } + + Slice key_to_write = key; // Portion of internal key to write out. + + uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8); + if (encoding_type_ == kPlain) { + if (fixed_user_key_len_ == kPlainTableVariableLength) { + // Write key length + char key_size_buf[5]; // tmp buffer for key size as varint32 + char* ptr = EncodeVarint32(key_size_buf, user_key_size); + assert(ptr <= key_size_buf + sizeof(key_size_buf)); + auto len = ptr - key_size_buf; + IOStatus io_s = file->Append(Slice(key_size_buf, len)); + if (!io_s.ok()) { + return io_s; + } + *offset += len; + } + } else { + assert(encoding_type_ == kPrefix); + char size_bytes[12]; + size_t size_bytes_pos = 0; + + Slice prefix = + prefix_extractor_->Transform(Slice(key.data(), user_key_size)); + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() || + key_count_for_prefix_ % index_sparseness_ == 0) { + key_count_for_prefix_ = 1; + pre_prefix_.SetUserKey(prefix); + size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; + } + *offset += size_bytes_pos; + } else { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { + // For second key within a prefix, need to encode prefix length + size_bytes_pos += + EncodeSize(kPrefixFromPreviousKey, + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()), + size_bytes + size_bytes_pos); + } + uint32_t prefix_len = + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()); + size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, + size_bytes + size_bytes_pos); + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; + } + *offset += size_bytes_pos; + key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); + } + } + + // Encode full key + // For value size as varint32 (up to 5 bytes). + // If the row is of value type with seqId 0, flush the special flag together + // in this buffer to safe one file append call, which takes 1 byte. 
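+  // For example (illustrative): an internal key <user_key><seq=0, kTypeValue>
+  // is written without its 8-byte footer; only the user key goes to the file
+  // and the one-byte kValueTypeSeqId0 marker is queued in meta_bytes_buf so
+  // the caller can flush it together with the value size later.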
+ if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + IOStatus io_s = + file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); + if (!io_s.ok()) { + return io_s; + } + *offset += key_to_write.size() - 8; + meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; + *meta_bytes_buf_size += 1; + } else { + IOStatus io_s = file->Append(key_to_write); + if (!io_s.ok()) { + return io_s; + } + *offset += key_to_write.size(); + } + + return IOStatus::OK(); +} + +Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, + uint32_t len) { + assert(file_offset + len <= file_info_->data_end_offset); + return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset), + len); +} + +bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, + Slice* out) { + const uint32_t kPrefetchSize = 256u; + + // Try to read from buffers. + for (uint32_t i = 0; i < num_buf_; i++) { + Buffer* buffer = buffers_[num_buf_ - 1 - i].get(); + if (file_offset >= buffer->buf_start_offset && + file_offset + len <= buffer->buf_start_offset + buffer->buf_len) { + *out = GetFromBuffer(buffer, file_offset, len); + return true; + } + } + + Buffer* new_buffer; + // Data needed is not in any of the buffer. Allocate a new buffer. + if (num_buf_ < buffers_.size()) { + // Add a new buffer + new_buffer = new Buffer(); + buffers_[num_buf_++].reset(new_buffer); + } else { + // Now simply replace the last buffer. Can improve the placement policy + // if needed. + new_buffer = buffers_[num_buf_ - 1].get(); + } + + assert(file_offset + len <= file_info_->data_end_offset); + uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, + std::max(kPrefetchSize, len)); + if (size_to_read > new_buffer->buf_capacity) { + new_buffer->buf.reset(new char[size_to_read]); + new_buffer->buf_capacity = size_to_read; + new_buffer->buf_len = 0; + } + Slice read_result; + // TODO: rate limit plain table reads. + Status s = + file_info_->file->Read(IOOptions(), file_offset, size_to_read, + &read_result, new_buffer->buf.get(), nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + status_ = s; + return false; + } + new_buffer->buf_start_offset = file_offset; + new_buffer->buf_len = size_to_read; + *out = GetFromBuffer(new_buffer, file_offset, len); + return true; +} + +inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + if (file_info_->is_mmap_mode) { + const char* start = file_info_->file_data.data() + offset; + const char* limit = + file_info_->file_data.data() + file_info_->data_end_offset; + const char* key_ptr = GetVarint32Ptr(start, limit, out); + assert(key_ptr != nullptr); + *bytes_read = static_cast<uint32_t>(key_ptr - start); + return true; + } else { + return ReadVarint32NonMmap(offset, out, bytes_read); + } +} + +bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + const char* start; + const char* limit; + const uint32_t kMaxVarInt32Size = 6u; + uint32_t bytes_to_read = + std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size); + Slice bytes; + if (!Read(offset, bytes_to_read, &bytes)) { + return false; + } + start = bytes.data(); + limit = bytes.data() + bytes.size(); + + const char* key_ptr = GetVarint32Ptr(start, limit, out); + *bytes_read = + (key_ptr != nullptr) ? 
static_cast<uint32_t>(key_ptr - start) : 0; + return true; +} + +Status PlainTableKeyDecoder::ReadInternalKey( + uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key, + uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) { + Slice tmp_slice; + bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice); + if (!success) { + return file_reader_.status(); + } + if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + parsed_key->user_key = Slice(tmp_slice.data(), user_key_size); + parsed_key->sequence = 0; + parsed_key->type = kTypeValue; + *bytes_read += user_key_size + 1; + *internal_key_valid = false; + } else { + success = file_reader_.Read(file_offset, user_key_size + 8, internal_key); + if (!success) { + return file_reader_.status(); + } + *internal_key_valid = true; + Status pik_status = ParseInternalKey(*internal_key, parsed_key, + false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return Status::Corruption( + Slice("Corrupted key found during next key read. "), + pik_status.getState()); + } + *bytes_read += user_key_size + 8; + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* /*seekable*/) { + uint32_t user_key_size = 0; + Status s; + if (fixed_user_key_len_ != kPlainTableVariableLength) { + user_key_size = fixed_user_key_len_; + } else { + uint32_t tmp_size = 0; + uint32_t tmp_read; + bool success = + file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_read > 0); + user_key_size = tmp_size; + *bytes_read = tmp_read; + } + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + cur_key_.SetInternalKey(*parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), user_key_size); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else if (internal_key != nullptr) { + if (decoded_internal_key_valid) { + *internal_key = decoded_internal_key; + } else { + // Need to copy out the internal key + cur_key_.SetInternalKey(*parsed_key); + *internal_key = cur_key_.GetInternalKey(); + } + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPrefixEncodingKey( + uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, + uint32_t* bytes_read, bool* seekable) { + PlainTableEntryType entry_type; + + bool expect_suffix = false; + Status s; + do { + uint32_t size = 0; + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + uint32_t my_bytes_read = 0; + s = DecodeSize(start_offset + *bytes_read, &entry_type, &size, + &my_bytes_read); + if (!s.ok()) { + return s; + } + if (my_bytes_read == 0) { + return Status::Corruption("Unexpected EOF when reading size of the key"); + } + *bytes_read += my_bytes_read; + + switch (entry_type) { + case kFullKey: { + expect_suffix = false; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if 
(!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode || + (internal_key != nullptr && !decoded_internal_key_valid)) { + // In non-mmap mode, always need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + cur_key_.SetInternalKey(*parsed_key); + saved_user_key_ = cur_key_.GetUserKey(); + if (!file_reader_.file_info()->is_mmap_mode) { + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), size); + } + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else { + if (internal_key != nullptr) { + *internal_key = decoded_internal_key; + } + saved_user_key_ = parsed_key->user_key; + } + break; + } + case kPrefixFromPreviousKey: { + if (seekable != nullptr) { + *seekable = false; + } + prefix_len_ = size; + assert(prefix_extractor_ == nullptr || + prefix_extractor_->Transform(saved_user_key_).size() == + prefix_len_); + // Need read another size flag for suffix + expect_suffix = true; + break; + } + case kKeySuffix: { + expect_suffix = false; + if (seekable != nullptr) { + *seekable = false; + } + + Slice tmp_slice; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &tmp_slice); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + // In non-mmap mode, we need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + // saved_user_key_ points to cur_key_. We are making a copy of + // the prefix part to another string, and construct the current + // key from the prefix part and the suffix part back to cur_key_. + std::string tmp = + Slice(saved_user_key_.data(), prefix_len_).ToString(); + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(tmp, *parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size); + saved_user_key_ = cur_key_.GetUserKey(); + } else { + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_), + *parsed_key); + } + parsed_key->user_key = cur_key_.GetUserKey(); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + break; + } + default: + return Status::Corruption("Un-identified size flag."); + } + } while (expect_suffix); // Another round if suffix is expected. 
+ return Status::OK(); +} + +Status PlainTableKeyDecoder::NextKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + uint32_t* bytes_read, bool* seekable) { + assert(value != nullptr); + Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read, + seekable); + if (s.ok()) { + assert(bytes_read != nullptr); + uint32_t value_size; + uint32_t value_size_bytes; + bool success = file_reader_.ReadVarint32(start_offset + *bytes_read, + &value_size, &value_size_bytes); + if (!success) { + return file_reader_.status(); + } + if (value_size_bytes == 0) { + return Status::Corruption( + "Unexpected EOF when reading the next value's size."); + } + *bytes_read += value_size_bytes; + success = file_reader_.Read(start_offset + *bytes_read, value_size, value); + if (!success) { + return file_reader_.status(); + } + *bytes_read += value_size; + } + return s; +} + +Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* seekable) { + *bytes_read = 0; + if (seekable != nullptr) { + *seekable = true; + } + if (encoding_type_ == kPlain) { + return NextPlainEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } else { + assert(encoding_type_ == kPrefix); + return NextPrefixEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LIT diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h new file mode 100644 index 000000000..9cda7df32 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_key_coding.h @@ -0,0 +1,201 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <array> + +#include "rocksdb/slice.h" +#include "table/plain/plain_table_reader.h" + +// The file contains three helper classes of PlainTable format, +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes issue the lowest level of operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory. +namespace ROCKSDB_NAMESPACE { + +class WritableFile; +struct ParsedInternalKey; +struct PlainTableReaderFileInfo; +enum PlainTableEntryType : unsigned char; + +// Helper class for PlainTable format to write out a key to an output file +// The class is used in PlainTableBuilder. +class PlainTableKeyEncoder { + public: + explicit PlainTableKeyEncoder(EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor, + size_t index_sparseness) + : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), + key_count_for_prefix_(0) {} + // key: the key to write out, in the format of internal key. + // file: the output file to write out + // offset: offset in the file. Needs to be updated after appending bytes + // for the key + // meta_bytes_buf: buffer for extra meta bytes + // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated + // if meta_bytes_buf is updated. 
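+  // Illustrative call (hypothetical buffer sizes; real call sites live in
+  // PlainTableBuilder):
+  //
+  //   char meta_bytes_buf[6];           // varint32 value size + 1 flag byte
+  //   size_t meta_bytes_buf_size = 0;
+  //   IOStatus io_s = encoder.AppendKey(internal_key, file, &offset,
+  //                                     meta_bytes_buf, &meta_bytes_buf_size);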
+ IOStatus AppendKey(const Slice& key, WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size); + + // Return actual encoding type to be picked + EncodingType GetEncodingType() { return encoding_type_; } + + private: + EncodingType encoding_type_; + uint32_t fixed_user_key_len_; + const SliceTransform* prefix_extractor_; + const size_t index_sparseness_; + size_t key_count_for_prefix_; + IterKey pre_prefix_; +}; + +// The class does raw file reads for PlainTableReader. +// It hides whether it is a mmap-read, or a non-mmap read. +// The class is implemented in a way to favor the performance of mmap case. +// The class is used by PlainTableReader. +class PlainTableFileReader { + public: + explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) + : file_info_(_file_info), num_buf_(0) {} + + ~PlainTableFileReader() { + // Should fix. + status_.PermitUncheckedError(); + } + + // In mmaped mode, the results point to mmaped area of the file, which + // means it is always valid before closing the file. + // In non-mmap mode, the results point to an internal buffer. If the caller + // makes another read call, the results may not be valid. So callers should + // make a copy when needed. + // In order to save read calls to files, we keep two internal buffers: + // the first read and the most recent read. This is efficient because it + // columns these two common use cases: + // (1) hash index only identify one location, we read the key to verify + // the location, and read key and value if it is the right location. + // (2) after hash index checking, we identify two locations (because of + // hash bucket conflicts), we binary search the two location to see + // which one is what we need and start to read from the location. + // These two most common use cases will be covered by the two buffers + // so that we don't need to re-read the same location. + // Currently we keep a fixed size buffer. If a read doesn't exactly fit + // the buffer, we replace the second buffer with the location user reads. + // + // If return false, status code is stored in status_. + bool Read(uint32_t file_offset, uint32_t len, Slice* out) { + if (file_info_->is_mmap_mode) { + assert(file_offset + len <= file_info_->data_end_offset); + *out = Slice(file_info_->file_data.data() + file_offset, len); + return true; + } else { + return ReadNonMmap(file_offset, len, out); + } + } + + // If return false, status code is stored in status_. + bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); + + // *bytes_read = 0 means eof. false means failure and status is saved + // in status_. Not directly returning Status to save copying status + // object to map previous performance of mmap mode. + inline bool ReadVarint32(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + Status status() const { return status_; } + + const PlainTableReaderFileInfo* file_info() { return file_info_; } + + private: + const PlainTableReaderFileInfo* file_info_; + + struct Buffer { + Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} + std::unique_ptr<char[]> buf; + uint32_t buf_start_offset; + uint32_t buf_len; + uint32_t buf_capacity; + }; + + // Keep buffers for two recent reads. 
+ std::array<std::unique_ptr<Buffer>, 2> buffers_; + uint32_t num_buf_; + Status status_; + + Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); +}; + +// A helper class to decode keys from input buffer +// The class is used by PlainTableBuilder. +class PlainTableKeyDecoder { + public: + explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, + EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor) + : file_reader_(file_info), + encoding_type_(encoding_type), + prefix_len_(0), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + in_prefix_(false) {} + + // Find the next key. + // start: char array where the key starts. + // limit: boundary of the char array + // parsed_key: the output of the result key + // internal_key: if not null, fill with the output of the result key in + // un-parsed format + // bytes_read: how many bytes read from start. Output + // seekable: whether key can be read from this place. Used when building + // indexes. Output. + Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, uint32_t* bytes_read, + bool* seekable = nullptr); + + Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + + PlainTableFileReader file_reader_; + EncodingType encoding_type_; + uint32_t prefix_len_; + uint32_t fixed_user_key_len_; + Slice saved_user_key_; + IterKey cur_key_; + const SliceTransform* prefix_extractor_; + bool in_prefix_; + + private: + Status NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status NextPrefixEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, + ParsedInternalKey* parsed_key, uint32_t* bytes_read, + bool* internal_key_valid, Slice* internal_key); + inline Status DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, uint32_t* key_size, + uint32_t* bytes_read); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc new file mode 100644 index 000000000..6ce3d0ab9 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_reader.cc @@ -0,0 +1,765 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
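+//
+// plain_table_reader.cc implements PlainTableReader: it opens a PlainTable
+// SST file, loads or rebuilds the prefix-hash index and bloom filter, and
+// serves Get() and iterator reads through PlainTableKeyDecoder.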
+ +#ifndef ROCKSDB_LITE + +#include "table/plain/plain_table_reader.h" + +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "memory/arena.h" +#include "monitoring/histogram.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Safely getting a uint32_t element from a char array, where, starting from +// `base`, every 4 bytes are considered as an fixed 32 bit integer. +inline uint32_t GetFixed32Element(const char* base, size_t offset) { + return DecodeFixed32(base + offset * sizeof(uint32_t)); +} +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public InternalIterator { + public: + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; + + ~PlainTableIterator() override; + + bool Valid() const override; + + void SeekToFirst() override; + + void SeekToLast() override; + + void Seek(const Slice& target) override; + + void SeekForPrev(const Slice& target) override; + + void Next() override; + + void Prev() override; + + Slice key() const override; + + Slice value() const override; + + Status status() const override; + + private: + PlainTableReader* table_; + PlainTableKeyDecoder decoder_; + bool use_prefix_seek_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader( + const ImmutableOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) + : internal_comparator_(icomparator), + encoding_type_(encoding_type), + full_scan_mode_(false), + user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), + prefix_extractor_(prefix_extractor), + enable_bloom_(false), + bloom_(6), + file_info_(std::move(file), storage_options, + static_cast<uint32_t>(table_properties->data_size)), + ioptions_(ioptions), + file_size_(file_size), + table_properties_(nullptr) {} + +PlainTableReader::~PlainTableReader() { + // Should fix? 
+ status_.PermitUncheckedError(); +} + +Status PlainTableReader::Open( + const ImmutableOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table, + const SliceTransform* prefix_extractor) { + if (file_size > PlainTableIndex::kMaxFileSize) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + + std::unique_ptr<TableProperties> props; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + ioptions, &props); + if (!s.ok()) { + return s; + } + + assert(hash_table_ratio >= 0.0); + auto& user_props = props->user_collected_properties; + auto prefix_extractor_in_file = props->prefix_extractor_name; + + if (!full_scan_mode && + !prefix_extractor_in_file.empty() /* old version sst file*/ + && prefix_extractor_in_file != "nullptr") { + if (!prefix_extractor) { + return Status::InvalidArgument( + "Prefix extractor is missing when opening a PlainTable built " + "using a prefix extractor"); + } else if (prefix_extractor_in_file != prefix_extractor->AsString()) { + return Status::InvalidArgument( + "Prefix extractor given doesn't match the one used to build " + "PlainTable"); + } + } + + EncodingType encoding_type = kPlain; + auto encoding_type_prop = + user_props.find(PlainTablePropertyNames::kEncodingType); + if (encoding_type_prop != user_props.end()) { + encoding_type = static_cast<EncodingType>( + DecodeFixed32(encoding_type_prop->second.c_str())); + } + + std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props.get(), prefix_extractor)); + + s = new_reader->MmapDataIfNeeded(); + if (!s.ok()) { + return s; + } + + if (!full_scan_mode) { + s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key, + hash_table_ratio, index_sparseness, + huge_page_tlb_size); + if (!s.ok()) { + return s; + } + } else { + // Flag to indicate it is a full scan mode so that none of the indexes + // can be used. + new_reader->full_scan_mode_ = true; + } + // PopulateIndex can add to the props, so don't store them until now + new_reader->table_properties_ = std::move(props); + + if (immortal_table && new_reader->file_info_.is_mmap_mode) { + new_reader->dummy_cleanable_.reset(new Cleanable()); + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() {} + +InternalIterator* PlainTableReader::NewIterator( + const ReadOptions& options, const SliceTransform* /* prefix_extractor */, + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { + // Not necessarily used here, but make sure this has been initialized + assert(table_properties_); + + // Auto prefix mode is not implemented in PlainTable. 
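+  // Prefix seek is used only when the table was built with a prefix
+  // extractor (i.e. not total-order mode) and the read options request
+  // neither total_order_seek nor auto_prefix_mode.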
+ bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && + !options.auto_prefix_mode; + if (arena == nullptr) { + return new PlainTableIterator(this, use_prefix_seek); + } else { + auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); + return new (mem) PlainTableIterator(this, use_prefix_seek); + } +} + +Status PlainTableReader::PopulateIndexRecordList( + PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes) { + Slice prev_key_prefix_slice; + std::string prev_key_prefix_buf; + uint32_t pos = data_start_offset_; + + bool is_first_record = true; + Slice key_prefix_slice; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + while (pos < file_info_.data_end_offset) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + bool seekable = false; + Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); + if (!s.ok()) { + return s; + } + + key_prefix_slice = GetPrefix(key); + if (enable_bloom_) { + bloom_.AddHash(GetSliceHash(key.user_key)); + } else { + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + if (!is_first_record) { + prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); + } + if (file_info_.is_mmap_mode) { + prev_key_prefix_slice = key_prefix_slice; + } else { + prev_key_prefix_buf = key_prefix_slice.ToString(); + prev_key_prefix_slice = prev_key_prefix_buf; + } + } + } + + index_builder->AddKeyPrefix(GetPrefix(key), key_offset); + + if (!seekable && is_first_record) { + return Status::Corruption("Key for a prefix is not seekable"); + } + + is_first_record = false; + } + + prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); + auto s = index_.InitFromRawData(index_builder->Finish()); + return s; +} + +void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, + size_t huge_page_tlb_size) { + uint32_t bloom_total_bits = num_keys * bloom_bits_per_key; + if (bloom_total_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.logger); + } +} + +void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) { + assert(bloom_.IsInitialized()); + for (const auto prefix_hash : prefix_hashes) { + bloom_.AddHash(prefix_hash); + } +} + +Status PlainTableReader::MmapDataIfNeeded() { + if (file_info_.is_mmap_mode) { + // Get mmapped memory. + return file_info_.file->Read( + IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data, + nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + } + return Status::OK(); +} + +Status PlainTableReader::PopulateIndex(TableProperties* props, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness, + size_t huge_page_tlb_size) { + assert(props != nullptr); + + BlockContents index_block_contents; + Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + PlainTableIndexBuilder::kPlainTableIndexBlock, + BlockType::kIndex, &index_block_contents); + + bool index_in_file = s.ok(); + + BlockContents bloom_block_contents; + bool bloom_in_file = false; + // We only need to read the bloom block if index block is in file. 
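+  // Both meta blocks are optional. If the index block is not found in the
+  // file, the index (and the bloom filter, if enabled) is rebuilt below by
+  // scanning the data through PopulateIndexRecordList().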
+ if (index_in_file) { + s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents); + bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; + } + + Slice* bloom_block; + if (bloom_in_file) { + // If bloom_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the bloom block. + // It needs to be kept alive to keep `bloom_block` valid. + bloom_block_alloc_ = std::move(bloom_block_contents.allocation); + bloom_block = &bloom_block_contents.data; + } else { + bloom_block = nullptr; + } + + Slice* index_block; + if (index_in_file) { + // If index_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the index block. + // It needs to be kept alive to keep `index_block` valid. + index_block_alloc_ = std::move(index_block_contents.allocation); + index_block = &index_block_contents.data; + } else { + index_block = nullptr; + } + + if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { + // moptions.prefix_extractor is requried for a hash-based look-up. + return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. + + if (!index_in_file) { + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + AllocateBloom(bloom_bits_per_key, + static_cast<uint32_t>(props->num_entries), + huge_page_tlb_size); + } + } else if (bloom_in_file) { + enable_bloom_ = true; + auto num_blocks_property = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + + uint32_t num_blocks = 0; + if (num_blocks_property != props->user_collected_properties.end()) { + Slice temp_slice(num_blocks_property->second); + if (!GetVarint32(&temp_slice, &num_blocks)) { + num_blocks = 0; + } + } + // cast away const qualifier, because bloom_ won't be changed + bloom_.SetRawData(const_cast<char*>(bloom_block->data()), + static_cast<uint32_t>(bloom_block->size()) * 8, + num_blocks); + } else { + // Index in file but no bloom in file. Disable bloom filter in this case. + enable_bloom_ = false; + bloom_bits_per_key = 0; + } + + PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, + index_sparseness, hash_table_ratio, + huge_page_tlb_size); + + std::vector<uint32_t> prefix_hashes; + if (!index_in_file) { + // Populates _bloom if enabled (total order mode) + s = PopulateIndexRecordList(&index_builder, &prefix_hashes); + if (!s.ok()) { + return s; + } + } else { + s = index_.InitFromRawData(*index_block); + if (!s.ok()) { + return s; + } + } + + if (!index_in_file) { + if (!IsTotalOrderMode()) { + // Calculated bloom filter size and allocate memory for + // bloom filter based on the number of prefixes, then fill it. + AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(), + huge_page_tlb_size); + if (enable_bloom_) { + FillBloom(prefix_hashes); + } + } + } + + // Fill two table properties. 
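+  // The two properties are "plain_table_hash_table_size" and
+  // "plain_table_sub_index_size"; they reflect the freshly built index and
+  // are recorded as 0 when the index was loaded from the file instead.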
+ if (!index_in_file) { + props->user_collected_properties["plain_table_hash_table_size"] = + std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); + props->user_collected_properties["plain_table_sub_index_size"] = + std::to_string(index_.GetSubIndexSize()); + } else { + props->user_collected_properties["plain_table_hash_table_size"] = + std::to_string(0); + props->user_collected_properties["plain_table_sub_index_size"] = + std::to_string(0); + } + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, + const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t* offset) const { + prefix_matched = false; + uint32_t prefix_index_offset; + auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); + if (res == PlainTableIndex::kNoPrefixForBucket) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } else if (res == PlainTableIndex::kDirectToFile) { + *offset = prefix_index_offset; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t upper_bound = 0; + const char* base_ptr = + index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); + uint32_t low = 0; + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + Status s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + + // The key is between [low, high). Do a binary search between it. + while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = GetFixed32Element(base_ptr, mid); + uint32_t tmp; + s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + *offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + ParsedInternalKey low_key; + uint32_t tmp; + uint32_t low_key_offset = GetFixed32Element(base_ptr, low); + s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + *offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + *offset = GetFixed32Element(base_ptr, low + 1); + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. 
+ *offset = file_info_.data_end_offset; + } + return Status::OK(); +} + +bool PlainTableReader::MatchBloom(uint32_t hash) const { + if (!enable_bloom_) { + return true; + } + + if (bloom_.MayContainHash(hash)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } +} + +Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + bool* seekable) const { + if (*offset == file_info_.data_end_offset) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } + + if (*offset > file_info_.data_end_offset) { + return Status::Corruption("Offset is out of file size"); + } + + uint32_t bytes_read; + Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, + &bytes_read, seekable); + if (!s.ok()) { + return s; + } + *offset = *offset + bytes_read; + return Status::OK(); +} + +void PlainTableReader::Prepare(const Slice& target) { + if (enable_bloom_) { + uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); + bloom_.Prefetch(prefix_hash); + } +} + +Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, + GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { + // Check bloom filter first. + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + if (full_scan_mode_) { + status_ = + Status::InvalidArgument("Get() is not allowed in full scan mode."); + } + // Match whole user key for bloom filter check. + if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. + prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } + } + uint32_t offset; + bool prefix_match; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, + prefix_match, &offset); + + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + + Slice found_value; + while (offset < file_info_.data_end_offset) { + s = Next(&decoder, &offset, &found_key, nullptr, &found_value); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. + if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? 
+ if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(found_key, found_value, &dont_care, + dummy_cleanable_.get())) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) { + return 0; +} + +uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, + const Slice& /*end*/, + TableReaderCaller /*caller*/) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), + decoder_(&table_->file_info_, table_->encoding_type_, + table_->user_key_len_, table_->prefix_extractor_), + use_prefix_seek_(use_prefix_seek) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +PlainTableIterator::~PlainTableIterator() {} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->file_info_.data_end_offset && + offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + status_ = Status::OK(); + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->file_info_.data_end_offset) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Seek(const Slice& target) { + if (use_prefix_seek_ != !table_->IsTotalOrderMode()) { + // This check is done here instead of NewIterator() to permit creating an + // iterator with total_order_seek = true even if we won't be able to Seek() + // it. This is needed for compaction: it creates iterator with + // total_order_seek = true but usually never does Seek() on it, + // only SeekToFirst(). + status_ = Status::InvalidArgument( + "total_order_seek not implemented for PlainTable."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + // If the user doesn't set prefix seek option and we are not able to do a + // total Seek(). assert failure. + if (table_->IsTotalOrderMode()) { + if (table_->full_scan_mode_) { + status_ = + Status::InvalidArgument("Seek() is not allowed in full scan mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } else if (table_->GetIndexSize() > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order " + "mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = 0; + // Bloom filter is ignored in total-order mode. 
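+  // In total-order mode the prefix is empty and there is a single index
+  // bucket, so the bloom filter is not consulted here.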
+ if (!table_->IsTotalOrderMode()) { + prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MatchBloom(prefix_hash)) { + status_ = Status::OK(); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + bool prefix_match; + status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, + prefix_match, &next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + if (next_offset_ < table_->file_info_.data_end_offset) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->file_info_.data_end_offset; + } +} + +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { + assert(false); + status_ = + Status::NotSupported("SeekForPrev() is not supported in PlainTable"); + offset_ = next_offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->file_info_.data_end_offset) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = + table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + } + } +} + +void PlainTableIterator::Prev() { assert(false); } + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { return status_; } + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h new file mode 100644 index 000000000..62bda693a --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_reader.h @@ -0,0 +1,244 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#ifndef ROCKSDB_LITE +#include <stdint.h> + +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "file/random_access_file_reader.h" +#include "memory/arena.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +class Block; +struct BlockContents; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class InternalKeyComparator; +class PlainTableKeyDecoder; +class GetContext; + +extern const uint32_t kPlainTableVariableLength; + +struct PlainTableReaderFileInfo { + bool is_mmap_mode; + Slice file_data; + uint32_t data_end_offset; + std::unique_ptr<RandomAccessFileReader> file; + + PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file, + const EnvOptions& storage_options, + uint32_t _data_size_offset) + : is_mmap_mode(storage_options.use_mmap_reads), + data_end_offset(_data_size_offset), + file(std::move(_file)) {} +}; + +// The reader class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader : public TableReader { + public: + // Based on following output file format shown in plain_table_factory.h + // When opening the output file, PlainTableReader creates a hash table + // from key prefixes to offset of the output file. PlainTable will decide + // whether it points to the data offset of the first key with the key prefix + // or the offset of it. If there are too many keys share this prefix, it will + // create a binary search-able index from the suffix to offset on disk. 
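+  // Illustrative call (hypothetical parameter values; normally issued by
+  // PlainTableFactory::NewTableReader):
+  //
+  //   std::unique_ptr<TableReader> reader;
+  //   Status s = PlainTableReader::Open(
+  //       ioptions, env_options, internal_comparator, std::move(file),
+  //       file_size, &reader, /*bloom_bits_per_key=*/10,
+  //       /*hash_table_ratio=*/0.75, /*index_sparseness=*/16,
+  //       /*huge_page_tlb_size=*/0, /*full_scan_mode=*/false);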
+ static Status Open(const ImmutableOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table = false, + const SliceTransform* prefix_extractor = nullptr); + + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + void Prepare(const Slice& target) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + + uint32_t GetIndexSize() const { return index_.GetIndexSize(); } + void SetupForCompaction() override; + + std::shared_ptr<const TableProperties> GetTableProperties() const override { + return table_properties_; + } + + virtual size_t ApproximateMemoryUsage() const override { + return arena_.MemoryAllocatedBytes(); + } + + PlainTableReader(const ImmutableOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // props: the table properties object that need to be stored. Ownership of + // the object will be passed. + // + + Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + size_t huge_page_tlb_size); + + Status MmapDataIfNeeded(); + + private: + const InternalKeyComparator internal_comparator_; + EncodingType encoding_type_; + // represents plain table's current status. + Status status_; + + PlainTableIndex index_; + bool full_scan_mode_; + + // data_start_offset_ and data_end_offset_ defines the range of the + // sst file that stores data. 
+ const uint32_t data_start_offset_ = 0; + const uint32_t user_key_len_; + const SliceTransform* prefix_extractor_; + + static const size_t kNumInternalBytes = 8; + + // Bloom filter is used to rule out non-existent key + bool enable_bloom_; + PlainTableBloomV1 bloom_; + PlainTableReaderFileInfo file_info_; + Arena arena_; + CacheAllocationPtr index_block_alloc_; + CacheAllocationPtr bloom_block_alloc_; + + const ImmutableOptions& ioptions_; + std::unique_ptr<Cleanable> dummy_cleanable_; + uint64_t file_size_; + + protected: // for testing + std::shared_ptr<const TableProperties> table_properties_; + + private: + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(ExtractUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. + Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes); + + // Internal helper function to allocate memory for bloom filter + void AllocateBloom(int bloom_bits_per_key, int num_prefixes, + size_t huge_page_tlb_size); + + void FillBloom(const std::vector<uint32_t>& prefix_hashes); + + // Read the key and value at `offset` to parameters for keys, the and + // `seekable`. + // On success, `offset` will be updated as the offset for the next key. + // `parsed_key` will be key in parsed format. + // if `internal_key` is not empty, it will be filled with key with slice + // format. + // if `seekable` is not null, it will return whether we can directly read + // data using this offset. + Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, + bool* seekable = nullptr) const; + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. + Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, + const Slice& prefix, uint32_t prefix_hash, + bool& prefix_matched, uint32_t* offset) const; + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/scoped_arena_iterator.h b/src/rocksdb/table/scoped_arena_iterator.h new file mode 100644 index 000000000..2b8824d95 --- /dev/null +++ b/src/rocksdb/table/scoped_arena_iterator.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "port/port.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { +class ScopedArenaIterator { + void reset(InternalIterator* iter) noexcept { + if (iter_ != nullptr) { + iter_->~InternalIterator(); + } + iter_ = iter; + } + + public: + explicit ScopedArenaIterator(InternalIterator* iter = nullptr) + : iter_(iter) {} + + ScopedArenaIterator(const ScopedArenaIterator&) = delete; + ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete; + + ScopedArenaIterator(ScopedArenaIterator&& o) noexcept { + iter_ = o.iter_; + o.iter_ = nullptr; + } + + ScopedArenaIterator& operator=(ScopedArenaIterator&& o) noexcept { + reset(o.iter_); + o.iter_ = nullptr; + return *this; + } + + InternalIterator* operator->() { return iter_; } + InternalIterator* get() { return iter_; } + + void set(InternalIterator* iter) { reset(iter); } + + InternalIterator* release() { + assert(iter_ != nullptr); + auto* res = iter_; + iter_ = nullptr; + return res; + } + + ~ScopedArenaIterator() { reset(nullptr); } + + private: + InternalIterator* iter_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/sst_file_dumper.cc b/src/rocksdb/table/sst_file_dumper.cc new file mode 100644 index 000000000..122f0995a --- /dev/null +++ b/src/rocksdb/table/sst_file_dumper.cc @@ -0,0 +1,519 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
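As a brief aside on the ScopedArenaIterator class in scoped_arena_iterator.h above: it wraps an InternalIterator that was placement-allocated in an Arena, so on destruction it only runs the iterator's destructor and never calls delete. The sketch below shows that intended pattern; the helper name ScanWholeTable and the reader pointer are hypothetical, and the NewIterator() signature follows the TableReader interface shown elsewhere in this diff.

void ScanWholeTable(TableReader* reader) {
  Arena arena;
  ReadOptions read_options;
  // NewIterator() allocates the iterator inside `arena` when an arena is
  // passed, so it must not be deleted; ScopedArenaIterator handles that.
  ScopedArenaIterator iter(reader->NewIterator(
      read_options, /*prefix_extractor=*/nullptr, &arena,
      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() is an internal key; iter->value() is the stored value.
  }
  // ~ScopedArenaIterator() invokes ~InternalIterator(); the underlying
  // memory is reclaimed when `arena` goes out of scope.
}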
+// +#ifndef ROCKSDB_LITE + +#include "table/sst_file_dumper.h" + +#include <chrono> +#include <cinttypes> +#include <iostream> +#include <map> +#include <memory> +#include <sstream> +#include <vector> + +#include "db/blob/blob_index.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "options/cf_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_reader.h" +#include "util/compression.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +SstFileDumper::SstFileDumper(const Options& options, + const std::string& file_path, + Temperature file_temp, size_t readahead_size, + bool verify_checksum, bool output_hex, + bool decode_blob_index, const EnvOptions& soptions, + bool silent) + : file_name_(file_path), + read_num_(0), + file_temp_(file_temp), + output_hex_(output_hex), + decode_blob_index_(decode_blob_index), + soptions_(soptions), + silent_(silent), + options_(options), + ioptions_(options_), + moptions_(ColumnFamilyOptions(options_)), + read_options_(verify_checksum, false), + internal_comparator_(BytewiseComparator()) { + read_options_.readahead_size = readahead_size; + if (!silent_) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + } + init_result_ = GetTableReader(file_name_); +} + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; + +const char* testFileName = "test_file_name"; + +Status SstFileDumper::GetTableReader(const std::string& file_path) { + // Warning about 'magic_number' being uninitialized shows up only in UBsan + // builds. Though access is guarded by 's.ok()' checks, fix the issue to + // avoid any warnings. + uint64_t magic_number = Footer::kNullTableMagicNumber; + + // read table magic number + Footer footer; + + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr<FSRandomAccessFile> file; + uint64_t file_size = 0; + FileOptions fopts = soptions_; + fopts.temperature = file_temp_; + Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); + if (s.ok()) { + s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); + } + + // check empty file + // if true, skip further processing of this file + if (file_size == 0) { + return Status::Aborted(file_path, "Empty file"); + } + + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + + FilePrefetchBuffer prefetch_buffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + false /* track_min_offset */); + if (s.ok()) { + const uint64_t kSstDumpTailPrefetchSize = 512 * 1024; + uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize) + ? 
kSstDumpTailPrefetchSize + : file_size; + uint64_t prefetch_off = file_size - prefetch_size; + IOOptions opts; + s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off, + static_cast<size_t>(prefetch_size), + Env::IO_TOTAL /* rate_limiter_priority */); + + s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size, + &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + + fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + + // For old sst format, ReadTableProperties might fail but file can be read + if (ReadTableProperties(magic_number, file_.get(), file_size, + (magic_number == kBlockBasedTableMagicNumber) + ? &prefetch_buffer + : nullptr) + .ok()) { + s = SetTableOptionsByMagicNumber(magic_number); + if (s.ok()) { + if (table_properties_ && !table_properties_->comparator_name.empty()) { + ConfigOptions config_options; + const Comparator* user_comparator = nullptr; + s = Comparator::CreateFromString(config_options, + table_properties_->comparator_name, + &user_comparator); + if (s.ok()) { + assert(user_comparator); + internal_comparator_ = InternalKeyComparator(user_comparator); + } + } + } + } else { + s = SetOldTableOptions(); + } + options_.comparator = internal_comparator_.user_comparator(); + } + + if (s.ok()) { + s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size, + &table_reader_); + } + return s; +} + +Status SstFileDumper::NewTableReader( + const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, + std::unique_ptr<TableReader>* /*table_reader*/) { + auto t_opt = + TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_, + internal_comparator_, false /* skip_filters */, + false /* imortal */, true /* force_direct_prefetch */); + // Allow open file with global sequence number for backward compatibility. + t_opt.largest_seqno = kMaxSequenceNumber; + + // We need to turn off pre-fetching of index and filter nodes for + // BlockBasedTable + if (options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_, + /*enable_prefetch=*/false); + } + + // For all other factory implementation + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_); +} + +Status SstFileDumper::VerifyChecksum() { + // We could pass specific readahead setting into read options if needed. 
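+ // (As constructed, read_options_ already carries the readahead_size value
+ // passed to the SstFileDumper constructor, so it is used unchanged here.)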
+ return table_reader_->VerifyChecksum(read_options_, + TableReaderCaller::kSSTDumpTool); +} + +Status SstFileDumper::DumpTable(const std::string& out_filename) { + std::unique_ptr<WritableFile> out_file; + Env* env = options_.env; + Status s = env->NewWritableFile(out_filename, &out_file, soptions_); + if (s.ok()) { + s = table_reader_->DumpTable(out_file.get()); + } + if (!s.ok()) { + // close the file before return error, ignore the close error if there's any + out_file->Close().PermitUncheckedError(); + return s; + } + return out_file->Close(); +} + +Status SstFileDumper::CalculateCompressedTableSize( + const TableBuilderOptions& tb_options, size_t block_size, + uint64_t* num_data_blocks, uint64_t* compressed_table_size) { + std::unique_ptr<Env> env(NewMemEnv(options_.env)); + std::unique_ptr<WritableFileWriter> dest_writer; + Status s = + WritableFileWriter::Create(env->GetFileSystem(), testFileName, + FileOptions(soptions_), &dest_writer, nullptr); + if (!s.ok()) { + return s; + } + BlockBasedTableOptions table_options; + table_options.block_size = block_size; + BlockBasedTableFactory block_based_tf(table_options); + std::unique_ptr<TableBuilder> table_builder; + table_builder.reset( + block_based_tf.NewTableBuilder(tb_options, dest_writer.get())); + std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + table_builder->Add(iter->key(), iter->value()); + } + s = iter->status(); + if (!s.ok()) { + return s; + } + s = table_builder->Finish(); + if (!s.ok()) { + return s; + } + *compressed_table_size = table_builder->FileSize(); + assert(num_data_blocks != nullptr); + *num_data_blocks = table_builder->GetTableProperties().num_data_blocks; + return env->DeleteFile(testFileName); +} + +Status SstFileDumper::ShowAllCompressionSizes( + size_t block_size, + const std::vector<std::pair<CompressionType, const char*>>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) { + fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); + for (auto& i : compression_types) { + if (CompressionTypeSupported(i.first)) { + fprintf(stdout, "Compression: %-24s\n", i.second); + CompressionOptions compress_opt; + compress_opt.max_dict_bytes = max_dict_bytes; + compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; + compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; + compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer; + for (int32_t j = compress_level_from; j <= compress_level_to; j++) { + fprintf(stdout, "Compression level: %d", j); + compress_opt.level = j; + Status s = ShowCompressionSize(block_size, i.first, compress_opt); + if (!s.ok()) { + return s; + } + } + } else { + fprintf(stdout, "Unsupported compression type: %s.\n", i.second); + } + } + return Status::OK(); +} + +Status SstFileDumper::ShowCompressionSize( + size_t block_size, CompressionType compress_type, + const CompressionOptions& compress_opt) { + Options opts; + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.statistics->set_stats_level(StatsLevel::kAll); + const ImmutableOptions imoptions(opts); + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); + 
IntTblPropCollectorFactories block_based_table_factories; + + std::string column_family_name; + int unknown_level = -1; + TableBuilderOptions tb_opts( + imoptions, moptions, ikc, &block_based_table_factories, compress_type, + compress_opt, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, unknown_level); + uint64_t num_data_blocks = 0; + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + uint64_t file_size; + Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks, + &file_size); + if (!s.ok()) { + return s; + } + + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + fprintf(stdout, " Size: %10" PRIu64, file_size); + fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks); + fprintf(stdout, " Time Taken: %10s microsecs", + std::to_string( + std::chrono::duration_cast<std::chrono::microseconds>(end - start) + .count()) + .c_str()); + const uint64_t compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED); + const uint64_t not_compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED); + // When the option enable_index_compression is true, + // NUMBER_BLOCK_COMPRESSED is incremented for index block(s). + if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) { + num_data_blocks = compressed_blocks + not_compressed_blocks; + } + + const uint64_t ratio_not_compressed_blocks = + (num_data_blocks - compressed_blocks) - not_compressed_blocks; + const double compressed_pcnt = + (0 == num_data_blocks) ? 0.0 + : ((static_cast<double>(compressed_blocks) / + static_cast<double>(num_data_blocks)) * + 100.0); + const double ratio_not_compressed_pcnt = + (0 == num_data_blocks) + ? 0.0 + : ((static_cast<double>(ratio_not_compressed_blocks) / + static_cast<double>(num_data_blocks)) * + 100.0); + const double not_compressed_pcnt = + (0 == num_data_blocks) ? 0.0 + : ((static_cast<double>(not_compressed_blocks) / + static_cast<double>(num_data_blocks)) * + 100.0); + fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks, + compressed_pcnt); + fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)", + ratio_not_compressed_blocks, ratio_not_compressed_pcnt); + fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n", + not_compressed_blocks, not_compressed_pcnt); + return Status::OK(); +} + +// Reads TableProperties prior to opening table reader in order to set up +// options. +Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, + uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer) { + Status s = ROCKSDB_NAMESPACE::ReadTableProperties( + file, file_size, table_magic_number, ioptions_, &table_properties_, + /* memory_allocator= */ nullptr, prefetch_buffer); + if (!s.ok()) { + if (!silent_) { + fprintf(stdout, "Not able to read table properties\n"); + } + } + return s; +} + +Status SstFileDumper::SetTableOptionsByMagicNumber( + uint64_t table_magic_number) { + assert(table_properties_); + if (table_magic_number == kBlockBasedTableMagicNumber || + table_magic_number == kLegacyBlockBasedTableMagicNumber) { + BlockBasedTableFactory* bbtf = new BlockBasedTableFactory(); + // To force tail prefetching, we fake reporting two useful reads of 512KB + // from the tail. + // It needs at least two data points to warm up the stats. 
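+ // (The 512KB figure mirrors kSstDumpTailPrefetchSize used for the footer
+ // read in GetTableReader() above.)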
+ bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + + options_.table_factory.reset(bbtf); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based\n"); + } + + auto& props = table_properties_->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>( + DecodeFixed32(pos->second.c_str())); + if (index_type_on_file == + BlockBasedTableOptions::IndexType::kHashSearch) { + options_.prefix_extractor.reset(NewNoopTransform()); + } + } + } else if (table_magic_number == kPlainTableMagicNumber || + table_magic_number == kLegacyPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 1; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + plain_table_options.full_scan_mode = true; + + options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); + if (!silent_) { + fprintf(stdout, "Sst file format: plain table\n"); + } + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileDumper::SetOldTableOptions() { + assert(table_properties_ == nullptr); + options_.table_factory = std::make_shared<BlockBasedTableFactory>(); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based(old version)\n"); + } + + return Status::OK(); +} + +Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, + bool has_from, const std::string& from_key, + bool has_to, const std::string& to_key, + bool use_from_as_prefix) { + if (!table_reader_) { + return init_result_; + } + + InternalIterator* iter = table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), + /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kSSTDumpTool); + uint64_t i = 0; + if (has_from) { + InternalKey ikey; + ikey.SetMinPossibleForUserKey(from_key); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) break; + + ParsedInternalKey ikey; + Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; + continue; + } + + // the key returned is not prefixed with out 'from' key + if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) { + break; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } else { + BlobIndex blob_index; + + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + fprintf(stderr, "%s => error decoding blob index\n", + ikey.DebugString(true, output_hex_).c_str()); + continue; + } + + fprintf(stdout, "%s => %s\n", + 
ikey.DebugString(true, output_hex_).c_str(), + blob_index.DebugString(output_hex_).c_str()); + } + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +// Provides TableProperties to API user +Status SstFileDumper::ReadTableProperties( + std::shared_ptr<const TableProperties>* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_dumper.h b/src/rocksdb/table/sst_file_dumper.h new file mode 100644 index 000000000..7be876390 --- /dev/null +++ b/src/rocksdb/table/sst_file_dumper.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include <memory> +#include <string> + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/advanced_options.h" + +namespace ROCKSDB_NAMESPACE { + +class SstFileDumper { + public: + explicit SstFileDumper(const Options& options, const std::string& file_name, + Temperature file_temp, size_t readahead_size, + bool verify_checksum, bool output_hex, + bool decode_blob_index, + const EnvOptions& soptions = EnvOptions(), + bool silent = false); + + Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + const std::string& from_key, bool has_to, + const std::string& to_key, + bool use_from_as_prefix = false); + + Status ReadTableProperties( + std::shared_ptr<const TableProperties>* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + Status VerifyChecksum(); + Status DumpTable(const std::string& out_filename); + Status getStatus() { return init_result_; } + + Status ShowAllCompressionSizes( + size_t block_size, + const std::vector<std::pair<CompressionType, const char*>>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer); + + Status ShowCompressionSize(size_t block_size, CompressionType compress_type, + const CompressionOptions& compress_opt); + + private: + // Get the TableReader implementation for the sst file + Status GetTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer); + + Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options, + size_t block_size, + uint64_t* num_data_blocks, + uint64_t* compressed_table_size); + + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + // Helper function to call the factory with settings specific to the + // factory implementation + Status NewTableReader(const ImmutableOptions& ioptions, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, + std::unique_ptr<TableReader>* table_reader); + + std::string file_name_; + uint64_t read_num_; + Temperature file_temp_; + bool output_hex_; + bool decode_blob_index_; + EnvOptions soptions_; + // less verbose in 
stdout/stderr + bool silent_; + + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + + Status init_result_; + std::unique_ptr<TableReader> table_reader_; + std::unique_ptr<RandomAccessFileReader> file_; + + const ImmutableOptions ioptions_; + const MutableCFOptions moptions_; + ReadOptions read_options_; + InternalKeyComparator internal_comparator_; + std::unique_ptr<TableProperties> table_properties_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_reader.cc b/src/rocksdb/table/sst_file_reader.cc new file mode 100644 index 000000000..48f1be0be --- /dev/null +++ b/src/rocksdb/table/sst_file_reader.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_file_reader.h" + +#include "db/arena_wrapped_db_iter.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "file/random_access_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "table/get_context.h" +#include "table/table_builder.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +struct SstFileReader::Rep { + Options options; + EnvOptions soptions; + ImmutableOptions ioptions; + MutableCFOptions moptions; + + std::unique_ptr<TableReader> table_reader; + + Rep(const Options& opts) + : options(opts), + soptions(options), + ioptions(options), + moptions(ColumnFamilyOptions(options)) {} +}; + +SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {} + +SstFileReader::~SstFileReader() {} + +Status SstFileReader::Open(const std::string& file_path) { + auto r = rep_.get(); + Status s; + uint64_t file_size = 0; + std::unique_ptr<FSRandomAccessFile> file; + std::unique_ptr<RandomAccessFileReader> file_reader; + FileOptions fopts(r->soptions); + const auto& fs = r->options.env->GetFileSystem(); + + s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); + if (s.ok()) { + s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); + } + if (s.ok()) { + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + if (s.ok()) { + TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor, + r->soptions, r->ioptions.internal_comparator); + // Allow open file with global sequence number for backward compatibility. + t_opt.largest_seqno = kMaxSequenceNumber; + s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &r->table_reader); + } + return s; +} + +Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { + auto r = rep_.get(); + auto sequence = roptions.snapshot != nullptr + ? 
roptions.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, + r->moptions.max_sequential_skip_in_iterations, + 0 /* version_number */, nullptr /* read_callback */, + nullptr /* db_impl */, nullptr /* cfd */, + true /* expose_blob_index */, false /* allow_refresh */); + auto internal_iter = r->table_reader->NewIterator( + res->GetReadOptions(), r->moptions.prefix_extractor.get(), + res->GetArena(), false /* skip_filters */, + TableReaderCaller::kSSTFileReader); + res->SetIterUnderDBIter(internal_iter); + return res; +} + +std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties() + const { + return rep_->table_reader->GetTableProperties(); +} + +Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + return rep_->table_reader->VerifyChecksum(read_options, + TableReaderCaller::kSSTFileReader); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_reader_test.cc b/src/rocksdb/table/sst_file_reader_test.cc new file mode 100644 index 000000000..4837d223b --- /dev/null +++ b/src/rocksdb/table/sst_file_reader_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_file_reader.h" + +#include <cinttypes> + +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/sst_file_writer.h" +#include "table/sst_file_writer_collectors.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeAsString(uint64_t v) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08" PRIu64, v); + return std::string(buf); +} + +std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; +} + +class SstFileReaderTest : public testing::Test { + public: + SstFileReaderTest() { + options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); + sst_name_ = test::PerThreadDBPath("sst_file"); + + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(nullptr, base_env); + env_ = base_env; + options_.env = env_; + } + + ~SstFileReaderTest() { + Status s = env_->DeleteFile(sst_name_); + EXPECT_OK(s); + } + + void CreateFile(const std::string& file_name, + const std::vector<std::string>& keys) { + SstFileWriter writer(soptions_, options_); + ASSERT_OK(writer.Open(file_name)); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_OK(writer.Put(keys[i], keys[i])); + ASSERT_OK(writer.Merge(keys[i + 1], EncodeAsUint64(i + 1))); + ASSERT_OK(writer.Delete(keys[i + 2])); + } + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& file_name, + const std::vector<std::string>& keys, + bool check_global_seqno = false) { + ReadOptions ropts; + SstFileReader reader(options_); + ASSERT_OK(reader.Open(file_name)); + ASSERT_OK(reader.VerifyChecksum()); + std::unique_ptr<Iterator> iter(reader.NewIterator(ropts)); + iter->SeekToFirst(); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_TRUE(iter->Valid()); + 
ASSERT_EQ(iter->key().compare(keys[i]), 0); + ASSERT_EQ(iter->value().compare(keys[i]), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i + 1]), 0); + ASSERT_EQ(iter->value().compare(EncodeAsUint64(i + 1)), 0); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + if (check_global_seqno) { + auto properties = reader.GetTableProperties(); + ASSERT_TRUE(properties); + std::string hostname; + ASSERT_OK(env_->GetHostNameString(&hostname)); + ASSERT_EQ(properties->db_host_id, hostname); + auto& user_properties = properties->user_collected_properties; + ASSERT_TRUE( + user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); + } + } + + void CreateFileAndCheck(const std::vector<std::string>& keys) { + CreateFile(sst_name_, keys); + CheckFile(sst_name_, keys); + } + + protected: + Options options_; + EnvOptions soptions_; + std::string sst_name_; + std::shared_ptr<Env> env_guard_; + Env* env_; +}; + +const uint64_t kNumKeys = 100; + +TEST_F(SstFileReaderTest, Basic) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, Uint64Comparator) { + options_.comparator = test::Uint64Comparator(); + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, ReadOptionsOutOfScope) { + // Repro a bug where the SstFileReader depended on its configured ReadOptions + // outliving it. + options_.comparator = test::Uint64Comparator(); + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFile(sst_name_, keys); + + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + std::unique_ptr<Iterator> iter; + { + // Make sure ReadOptions go out of scope ASAP so we know the iterator + // operations do not depend on it. + ReadOptions ropts; + iter.reset(reader.NewIterator(ropts)); + } + iter->SeekToFirst(); + while (iter->Valid()) { + iter->Next(); + } +} + +TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + // Generate a SST file. + CreateFile(sst_name_, keys); + + // Ingest the file into a db, to assign it a global sequence number. + Options options; + options.create_if_missing = true; + std::string db_name = test::PerThreadDBPath("test_db"); + DB* db; + ASSERT_OK(DB::Open(options, db_name, &db)); + // Bump sequence number. + ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo")); + ASSERT_OK(db->Flush(FlushOptions())); + // Ingest the file. + IngestExternalFileOptions ingest_options; + ingest_options.write_global_seqno = true; + ASSERT_OK(db->IngestExternalFile({sst_name_}, ingest_options)); + std::vector<std::string> live_files; + uint64_t manifest_file_size = 0; + ASSERT_OK(db->GetLiveFiles(live_files, &manifest_file_size)); + // Get the ingested file. + std::string ingested_file; + for (auto& live_file : live_files) { + if (live_file.substr(live_file.size() - 4, std::string::npos) == ".sst") { + if (ingested_file.empty() || ingested_file < live_file) { + ingested_file = live_file; + } + } + } + ASSERT_FALSE(ingested_file.empty()); + delete db; + + // Verify the file can be open and read by SstFileReader. + CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */); + + // Cleanup. 
+ ASSERT_OK(DestroyDB(db_name, options)); +} + +TEST_F(SstFileReaderTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator is not timestamp-aware; calls to APIs taking timestamps should + // fail. + ASSERT_NOK(writer.Put("key", EncodeAsUint64(100), "value")); + ASSERT_NOK(writer.Delete("another_key", EncodeAsUint64(200))); +} + +class SstFileReaderTimestampTest : public testing::Test { + public: + SstFileReaderTimestampTest() { + Env* env = Env::Default(); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_)); + EXPECT_NE(nullptr, env); + + options_.env = env; + + options_.comparator = test::BytewiseComparatorWithU64TsWrapper(); + + sst_name_ = test::PerThreadDBPath("sst_file_ts"); + } + + ~SstFileReaderTimestampTest() { + EXPECT_OK(options_.env->DeleteFile(sst_name_)); + } + + struct KeyValueDesc { + KeyValueDesc(std::string k, std::string ts, std::string v) + : key(std::move(k)), timestamp(std::move(ts)), value(std::move(v)) {} + + std::string key; + std::string timestamp; + std::string value; + }; + + struct InputKeyValueDesc : public KeyValueDesc { + InputKeyValueDesc(std::string k, std::string ts, std::string v, bool is_del, + bool use_contig_buf) + : KeyValueDesc(std::move(k), std::move(ts), std::move(v)), + is_delete(is_del), + use_contiguous_buffer(use_contig_buf) {} + + bool is_delete = false; + bool use_contiguous_buffer = false; + }; + + struct OutputKeyValueDesc : public KeyValueDesc { + OutputKeyValueDesc(std::string k, std::string ts, std::string v) + : KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {} + }; + + void CreateFile(const std::vector<InputKeyValueDesc>& descs) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + for (const auto& desc : descs) { + if (desc.is_delete) { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Delete(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()))); + } else { + ASSERT_OK(writer.Delete(desc.key, desc.timestamp)); + } + } else { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Put(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()), + desc.value)); + } else { + ASSERT_OK(writer.Put(desc.key, desc.timestamp, desc.value)); + } + } + } + + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& timestamp, + const std::vector<OutputKeyValueDesc>& descs) { + SstFileReader reader(options_); + + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyChecksum()); + + Slice ts_slice(timestamp); + + ReadOptions read_options; + read_options.timestamp = &ts_slice; + + std::unique_ptr<Iterator> iter(reader.NewIterator(read_options)); + iter->SeekToFirst(); + + for (const auto& desc : descs) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), desc.key); + ASSERT_EQ(iter->timestamp(), desc.timestamp); + ASSERT_EQ(iter->value(), desc.value); + + iter->Next(); + } + + ASSERT_FALSE(iter->Valid()); + } + + protected: + std::shared_ptr<Env> env_guard_; + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +TEST_F(SstFileReaderTimestampTest, Basic) { + std::vector<InputKeyValueDesc> input_descs; + + for (uint64_t k = 0; k < kNumKeys; k += 4) { + // A Put with key k, timestamp k that gets overwritten by a subsequent Put + // 
with timestamp (k + 1). Note that the comparator uses descending order + // for the timestamp part, so we add the later Put first. + input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k + 1), + /* value */ EncodeAsString(k * 2), /* is_delete */ false, + /* use_contiguous_buffer */ false); + input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k), + /* value */ EncodeAsString(k * 3), /* is_delete */ false, + /* use_contiguous_buffer */ true); + + // A Put with key (k + 2), timestamp (k + 2) that gets cancelled out by a + // Delete with timestamp (k + 3). Note that the comparator uses descending + // order for the timestamp part, so we add the Delete first. + input_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(k + 3), + /* value */ std::string(), /* is_delete */ true, + /* use_contiguous_buffer */ (k % 8) == 0); + input_descs.emplace_back( + /* key */ EncodeAsString(k + 2), /* timestamp */ EncodeAsUint64(k + 2), + /* value */ EncodeAsString(k * 5), /* is_delete */ false, + /* use_contiguous_buffer */ (k % 8) != 0); + } + + CreateFile(input_descs); + + // Note: below, we check the results as of each timestamp in the range, + // updating the expected result as needed. + std::vector<OutputKeyValueDesc> output_descs; + + for (uint64_t ts = 0; ts < kNumKeys; ++ts) { + const uint64_t k = ts - (ts % 4); + + switch (ts % 4) { + case 0: // Initial Put for key k + output_descs.emplace_back(/* key */ EncodeAsString(k), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 3)); + break; + + case 1: // Second Put for key k + assert(output_descs.back().key == EncodeAsString(k)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 3)); + output_descs.back().timestamp = EncodeAsUint64(ts); + output_descs.back().value = EncodeAsString(k * 2); + break; + + case 2: // Put for key (k + 2) + output_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 5)); + break; + + case 3: // Delete for key (k + 2) + assert(output_descs.back().key == EncodeAsString(k + 2)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 5)); + output_descs.pop_back(); + break; + } + + CheckFile(EncodeAsUint64(ts), output_descs); + } +} + +TEST_F(SstFileReaderTimestampTest, TimestampsOutOfOrder) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Note: KVs that have the same user key disregarding timestamps should be in + // descending order of timestamps. + ASSERT_OK(writer.Put("key", EncodeAsUint64(1), "value1")); + ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value2")); +} + +TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator expects 64-bit timestamps; timestamps with other sizes as well + // as calls to the timestamp-less APIs should be rejected. 
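+ // (The fixture's comparator is test::BytewiseComparatorWithU64TsWrapper,
+ // so only 8-byte timestamps such as those produced by EncodeAsUint64()
+ // are accepted.)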
+ ASSERT_NOK(writer.Put("key", "not_an_actual_64_bit_timestamp", "value")); + ASSERT_NOK(writer.Delete("another_key", "timestamp_of_unexpected_size")); + + ASSERT_NOK(writer.Put("key_without_timestamp", "value")); + ASSERT_NOK(writer.Merge("another_key_missing_a_timestamp", "merge_operand")); + ASSERT_NOK(writer.Delete("yet_another_key_still_no_timestamp")); + ASSERT_NOK(writer.DeleteRange("begin_key_timestamp_absent", + "end_key_with_a_complete_lack_of_timestamps")); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_writer.cc b/src/rocksdb/table/sst_file_writer.cc new file mode 100644 index 000000000..16d11efd3 --- /dev/null +++ b/src/rocksdb/table/sst_file_writer.cc @@ -0,0 +1,427 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/sst_file_writer.h" + +#include <vector> + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/sst_file_writer_collectors.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string ExternalSstFilePropertyNames::kVersion = + "rocksdb.external_sst_file.version"; +const std::string ExternalSstFilePropertyNames::kGlobalSeqno = + "rocksdb.external_sst_file.global_seqno"; + +#ifndef ROCKSDB_LITE + +const size_t kFadviseTrigger = 1024 * 1024; // 1MB + +struct SstFileWriter::Rep { + Rep(const EnvOptions& _env_options, const Options& options, + Env::IOPriority _io_priority, const Comparator* _user_comparator, + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters, + std::string _db_session_id) + : env_options(_env_options), + ioptions(options), + mutable_cf_options(options), + io_priority(_io_priority), + internal_comparator(_user_comparator), + cfh(_cfh), + invalidate_page_cache(_invalidate_page_cache), + skip_filters(_skip_filters), + db_session_id(_db_session_id) {} + + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<TableBuilder> builder; + EnvOptions env_options; + ImmutableOptions ioptions; + MutableCFOptions mutable_cf_options; + Env::IOPriority io_priority; + InternalKeyComparator internal_comparator; + ExternalSstFileInfo file_info; + InternalKey ikey; + std::string column_family_name; + ColumnFamilyHandle* cfh; + // If true, We will give the OS a hint that this file pages is not needed + // every time we write 1MB to the file. + bool invalidate_page_cache; + // The size of the file during the last time we called Fadvise to remove + // cached pages from page cache. 
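+ // (Compared against builder->FileSize() in InvalidatePageCache(); a new
+ // hint is issued after roughly kFadviseTrigger (1MB) of newly written
+ // data, or unconditionally when the file is being closed.)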
+ uint64_t last_fadvise_size = 0; + bool skip_filters; + std::string db_session_id; + uint64_t next_file_number = 1; + + Status AddImpl(const Slice& user_key, const Slice& value, + ValueType value_type) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + + if (file_info.num_entries == 0) { + file_info.smallest_key.assign(user_key.data(), user_key.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + user_key, file_info.largest_key) <= 0) { + // Make sure that keys are added in order + return Status::InvalidArgument( + "Keys must be added in strict ascending order."); + } + } + + assert(value_type == kTypeValue || value_type == kTypeMerge || + value_type == kTypeDeletion || + value_type == kTypeDeletionWithTimestamp); + + constexpr SequenceNumber sequence_number = 0; + + ikey.Set(user_key, sequence_number, value_type); + + builder->Add(ikey.Encode(), value); + + // update file info + file_info.num_entries++; + file_info.largest_key.assign(user_key.data(), user_key.size()); + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); + } + + Status Add(const Slice& user_key, const Slice& value, ValueType value_type) { + if (internal_comparator.user_comparator()->timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + return AddImpl(user_key, value, value_type); + } + + Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value, + ValueType value_type) { + const size_t timestamp_size = timestamp.size(); + + if (internal_comparator.user_comparator()->timestamp_size() != + timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + const size_t user_key_size = user_key.size(); + + if (user_key.data() + user_key_size == timestamp.data()) { + Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size); + return AddImpl(user_key_with_ts, value, value_type); + } + + std::string user_key_with_ts; + user_key_with_ts.reserve(user_key_size + timestamp_size); + user_key_with_ts.append(user_key.data(), user_key_size); + user_key_with_ts.append(timestamp.data(), timestamp_size); + + return AddImpl(user_key_with_ts, value, value_type); + } + + Status DeleteRangeImpl(const Slice& begin_key, const Slice& end_key) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */); + if (file_info.num_range_del_entries == 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + tombstone.start_key_, file_info.smallest_range_del_key) < 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + } + if (internal_comparator.user_comparator()->Compare( + tombstone.end_key_, file_info.largest_range_del_key) > 0) { + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } + } + + auto ikey_and_end_key = tombstone.Serialize(); + builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); + + // update file info + file_info.num_range_del_entries++; + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); + } + + Status 
DeleteRange(const Slice& begin_key, const Slice& end_key) { + if (internal_comparator.user_comparator()->timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return DeleteRangeImpl(begin_key, end_key); + } + + // begin_key and end_key should be users keys without timestamp. + Status DeleteRange(const Slice& begin_key, const Slice& end_key, + const Slice& timestamp) { + const size_t timestamp_size = timestamp.size(); + + if (internal_comparator.user_comparator()->timestamp_size() != + timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + const size_t begin_key_size = begin_key.size(); + const size_t end_key_size = end_key.size(); + if (begin_key.data() + begin_key_size == timestamp.data() || + end_key.data() + begin_key_size == timestamp.data()) { + assert(memcmp(begin_key.data() + begin_key_size, + end_key.data() + end_key_size, timestamp_size) == 0); + Slice begin_key_with_ts(begin_key.data(), + begin_key_size + timestamp_size); + Slice end_key_with_ts(end_key.data(), end_key.size() + timestamp_size); + return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts); + } + std::string begin_key_with_ts; + begin_key_with_ts.reserve(begin_key_size + timestamp_size); + begin_key_with_ts.append(begin_key.data(), begin_key_size); + begin_key_with_ts.append(timestamp.data(), timestamp_size); + std::string end_key_with_ts; + end_key_with_ts.reserve(end_key_size + timestamp_size); + end_key_with_ts.append(end_key.data(), end_key_size); + end_key_with_ts.append(timestamp.data(), timestamp_size); + return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts); + } + + Status InvalidatePageCache(bool closing) { + Status s = Status::OK(); + if (invalidate_page_cache == false) { + // Fadvise disabled + return s; + } + uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size; + if (bytes_since_last_fadvise > kFadviseTrigger || closing) { + TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache", + &(bytes_since_last_fadvise)); + // Tell the OS that we don't need this file in page cache + s = file_writer->InvalidateCache(0, 0); + if (s.IsNotSupported()) { + // NotSupported is fine as it could be a file type that doesn't use page + // cache. + s = Status::OK(); + } + last_fadvise_size = builder->FileSize(); + } + return s; + } +}; + +SstFileWriter::SstFileWriter(const EnvOptions& env_options, + const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family, + bool invalidate_page_cache, + Env::IOPriority io_priority, bool skip_filters) + : rep_(new Rep(env_options, options, io_priority, user_comparator, + column_family, invalidate_page_cache, skip_filters, + DBImpl::GenerateDbSessionId(options.env))) { + // SstFileWriter is used to create sst files that can be added to database + // later. Therefore, no real db_id and db_session_id are associated with it. + // Here we mimic the way db_session_id behaves by getting a db_session_id + // for each SstFileWriter, and (later below) assign unique file numbers + // in the table properties. The db_id is set to be "SST Writer" for clarity. + + rep_->file_info.file_size = 0; +} + +SstFileWriter::~SstFileWriter() { + if (rep_->builder) { + // User did not call Finish() or Finish() failed, we need to + // abandon the builder. 
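+ // (TableBuilder's destructor, declared in table_builder.h later in this
+ // diff, requires that either Finish() or Abandon() has been called.)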
+ rep_->builder->Abandon(); + } +} + +Status SstFileWriter::Open(const std::string& file_path) { + Rep* r = rep_.get(); + Status s; + std::unique_ptr<FSWritableFile> sst_file; + FileOptions cur_file_opts(r->env_options); + s = r->ioptions.env->GetFileSystem()->NewWritableFile( + file_path, cur_file_opts, &sst_file, nullptr); + if (!s.ok()) { + return s; + } + + sst_file->SetIOPriority(r->io_priority); + + CompressionType compression_type; + CompressionOptions compression_opts; + if (r->mutable_cf_options.bottommost_compression != + kDisableCompressionOption) { + compression_type = r->mutable_cf_options.bottommost_compression; + if (r->mutable_cf_options.bottommost_compression_opts.enabled) { + compression_opts = r->mutable_cf_options.bottommost_compression_opts; + } else { + compression_opts = r->mutable_cf_options.compression_opts; + } + } else if (!r->mutable_cf_options.compression_per_level.empty()) { + // Use the compression of the last level if we have per level compression + compression_type = *(r->mutable_cf_options.compression_per_level.rbegin()); + compression_opts = r->mutable_cf_options.compression_opts; + } else { + compression_type = r->mutable_cf_options.compression; + compression_opts = r->mutable_cf_options.compression_opts; + } + + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + // SstFileWriter properties collector to add SstFileWriter version. + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + + // User collector factories + auto user_collector_factories = + r->ioptions.table_properties_collector_factories; + for (size_t i = 0; i < user_collector_factories.size(); i++) { + int_tbl_prop_collector_factories.emplace_back( + new UserKeyTablePropertiesCollectorFactory( + user_collector_factories[i])); + } + int unknown_level = -1; + uint32_t cf_id; + + if (r->cfh != nullptr) { + // user explicitly specified that this file will be ingested into cfh, + // we can persist this information in the file. + cf_id = r->cfh->GetID(); + r->column_family_name = r->cfh->GetName(); + } else { + r->column_family_name = ""; + cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } + + // TODO: it would be better to set oldest_key_time to be used for getting the + // approximate time of ingested keys. + TableBuilderOptions table_builder_options( + r->ioptions, r->mutable_cf_options, r->internal_comparator, + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* oldest_key_time */, + 0 /* file_creation_time */, "SST Writer" /* db_id */, r->db_session_id, + 0 /* target_file_size */, r->next_file_number); + // External SST files used to each get a unique session id. Now for + // slightly better uniqueness probability in constructing cache keys, we + // assign fake file numbers to each file (into table properties) and keep + // the same session id for the life of the SstFileWriter. + r->next_file_number++; + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. 
+ table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + // TODO(tec) : If table_factory is using compressed block cache, we will + // be adding the external sst file blocks into it, which is wasteful. + r->builder.reset(r->ioptions.table_factory->NewTableBuilder( + table_builder_options, r->file_writer.get())); + + r->file_info = ExternalSstFileInfo(); + r->file_info.file_path = file_path; + r->file_info.version = 2; + return s; +} + +Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp, + const Slice& value) { + return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeMerge); +} + +Status SstFileWriter::Delete(const Slice& user_key) { + return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); +} + +Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) { + return rep_->Add(user_key, timestamp, Slice(), + ValueType::kTypeDeletionWithTimestamp); +} + +Status SstFileWriter::DeleteRange(const Slice& begin_key, + const Slice& end_key) { + return rep_->DeleteRange(begin_key, end_key); +} + +Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key, + const Slice& timestamp) { + return rep_->DeleteRange(begin_key, end_key, timestamp); +} + +Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { + Rep* r = rep_.get(); + if (!r->builder) { + return Status::InvalidArgument("File is not opened"); + } + if (r->file_info.num_entries == 0 && + r->file_info.num_range_del_entries == 0) { + return Status::InvalidArgument("Cannot create sst file with no entries"); + } + + Status s = r->builder->Finish(); + r->file_info.file_size = r->builder->FileSize(); + + if (s.ok()) { + s = r->file_writer->Sync(r->ioptions.use_fsync); + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); + if (s.ok()) { + s = r->file_writer->Close(); + } + } + if (s.ok()) { + r->file_info.file_checksum = r->file_writer->GetFileChecksum(); + r->file_info.file_checksum_func_name = + r->file_writer->GetFileChecksumFuncName(); + } + if (!s.ok()) { + r->ioptions.env->DeleteFile(r->file_info.file_path); + } + + if (file_info != nullptr) { + *file_info = r->file_info; + } + + r->builder.reset(); + return s; +} + +uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; } +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/sst_file_writer_collectors.h b/src/rocksdb/table/sst_file_writer_collectors.h new file mode 100644 index 000000000..486315fb5 --- /dev/null +++ b/src/rocksdb/table/sst_file_writer_collectors.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <string> + +#include "db/table_properties_collector.h" +#include "rocksdb/types.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// Table Properties that are specific to tables created by SstFileWriter. +struct ExternalSstFilePropertyNames { + // value of this property is a fixed uint32 number. + static const std::string kVersion; + // value of this property is a fixed uint64 number. + static const std::string kGlobalSeqno; +}; + +// PropertiesCollector used to add properties specific to tables +// generated by SstFileWriter +class SstFileWriterPropertiesCollector : public IntTblPropCollector { + public: + explicit SstFileWriterPropertiesCollector(int32_t version, + SequenceNumber global_seqno) + : version_(version), global_seqno_(global_seqno) {} + + virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. + return Status::OK(); + } + + virtual void BlockAdd(uint64_t /* block_uncomp_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + virtual Status Finish(UserCollectedProperties* properties) override { + // File version + std::string version_val; + PutFixed32(&version_val, static_cast<uint32_t>(version_)); + properties->insert({ExternalSstFilePropertyNames::kVersion, version_val}); + + // Global Sequence number + std::string seqno_val; + PutFixed64(&seqno_val, static_cast<uint64_t>(global_seqno_)); + properties->insert({ExternalSstFilePropertyNames::kGlobalSeqno, seqno_val}); + + return Status::OK(); + } + + virtual const char* Name() const override { + return "SstFileWriterPropertiesCollector"; + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return {{ExternalSstFilePropertyNames::kVersion, std::to_string(version_)}}; + } + + private: + int32_t version_; + SequenceNumber global_seqno_; +}; + +class SstFileWriterPropertiesCollectorFactory + : public IntTblPropCollectorFactory { + public: + explicit SstFileWriterPropertiesCollectorFactory(int32_t version, + SequenceNumber global_seqno) + : version_(version), global_seqno_(global_seqno) {} + + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t /*column_family_id*/, int /* level_at_creation */) override { + return new SstFileWriterPropertiesCollector(version_, global_seqno_); + } + + virtual const char* Name() const override { + return "SstFileWriterPropertiesCollector"; + } + + private: + int32_t version_; + SequenceNumber global_seqno_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h new file mode 100644 index 000000000..1790f33b1 --- /dev/null +++ b/src/rocksdb/table/table_builder.h @@ -0,0 +1,219 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stdint.h> + +#include <string> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/seqno_to_time_mapping.h" +#include "db/table_properties_collector.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "table/unique_id_impl.h" +#include "trace_replay/block_cache_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Status; + +struct TableReaderOptions { + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr<const SliceTransform>& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters = false, bool _immortal = false, + bool _force_direct_prefetch = false, int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr, + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0, + UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0) + : ioptions(_ioptions), + prefix_extractor(_prefix_extractor), + env_options(_env_options), + internal_comparator(_internal_comparator), + skip_filters(_skip_filters), + immortal(_immortal), + force_direct_prefetch(_force_direct_prefetch), + level(_level), + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer), + max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num), + unique_id(_unique_id) {} + + const ImmutableOptions& ioptions; + const std::shared_ptr<const SliceTransform>& prefix_extractor; + const EnvOptions& env_options; + const InternalKeyComparator& internal_comparator; + // This is only used for BlockBasedTable (reader) + bool skip_filters; + // Whether the table will be valid as long as the DB is open + bool immortal; + // When data prefetching is needed, even if direct I/O is off, read data to + // fetch into RocksDB's buffer, rather than relying + // RandomAccessFile::Prefetch(). + bool force_direct_prefetch; + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. + int level; + // largest seqno in the table (or 0 means unknown???) + SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; + // Largest L0 file size whose meta-blocks may be pinned (can be zero when + // unknown). 
+ const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; + + // Known unique_id or {}, kNullUniqueId64x2 means unknown + UniqueId64x2 unique_id; +}; + +struct TableBuilderOptions { + TableBuilderOptions( + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const InternalKeyComparator& _internal_comparator, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, + const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, + const int64_t _oldest_key_time = 0, + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) + : ioptions(_ioptions), + moptions(_moptions), + internal_comparator(_internal_comparator), + int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), + compression_type(_compression_type), + compression_opts(_compression_opts), + column_family_id(_column_family_id), + column_family_name(_column_family_name), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size), + file_creation_time(_file_creation_time), + db_id(_db_id), + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + reason(_reason), + cur_file_num(_cur_file_num) {} + + const ImmutableOptions& ioptions; + const MutableCFOptions& moptions; + const InternalKeyComparator& internal_comparator; + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; + const CompressionType compression_type; + const CompressionOptions& compression_opts; + const uint32_t column_family_id; + const std::string& column_family_name; + const int64_t oldest_key_time; + const uint64_t target_file_size; + const uint64_t file_creation_time; + const std::string db_id; + const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; +}; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Return non-ok iff some error happens during IO. + virtual IOStatus io_status() const = 0; + + // Finish building the table. 
+ // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Whether the output file is completely empty. It has neither entries + // or tombstones. + virtual bool IsEmpty() const { + return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0; + } + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; + + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + virtual uint64_t EstimatedFileSize() const { return FileSize(); } + + // If the user defined table properties collector suggest the file to + // be further compacted. + virtual bool NeedCompact() const { return false; } + + // Returns table properties + virtual TableProperties GetTableProperties() const = 0; + + // Return file checksum + virtual std::string GetFileChecksum() const = 0; + + // Return file checksum function name + virtual const char* GetFileChecksumFuncName() const = 0; + + // Set the sequence number to time mapping + virtual void SetSeqnoTimeTableProperties( + const std::string& /*encoded_seqno_to_time_mapping*/, + uint64_t /*oldest_ancestor_time*/){}; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_factory.cc b/src/rocksdb/table/table_factory.cc new file mode 100644 index 000000000..fc5c5ccde --- /dev/null +++ b/src/rocksdb/table/table_factory.cc @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
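The TableBuilder contract documented above reduces to: Add() entries in sorted internal-key order, watch status(), and call exactly one of Finish() or Abandon() before the builder is destroyed. A small sketch of that pattern; the helper function and the std::map input are illustrative, not part of RocksDB:

#include <map>
#include <memory>
#include <string>
#include "table/table_builder.h"

// Feeds pre-sorted entries (already encoded as internal keys) to any
// TableBuilder implementation and observes the Finish()/Abandon() rule.
rocksdb::Status BuildTable(std::unique_ptr<rocksdb::TableBuilder> builder,
                           const std::map<std::string, std::string>& entries) {
  for (const auto& kv : entries) {
    builder->Add(kv.first, kv.second);
    if (!builder->status().ok()) {
      break;
    }
  }
  rocksdb::Status s = builder->status();
  if (s.ok()) {
    s = builder->Finish();   // writes meta blocks and the footer
  } else {
    builder->Abandon();      // required if Finish() will not be called
  }
  return s;
}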
+ +#include <mutex> + +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/plain/plain_table_factory.h" + +namespace ROCKSDB_NAMESPACE { + +static void RegisterTableFactories(const std::string& /*arg*/) { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + std::call_once(loaded, []() { + auto library = ObjectLibrary::Default(); + library->AddFactory<TableFactory>( + TableFactory::kBlockBasedTableName(), + [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new BlockBasedTableFactory()); + return guard->get(); + }); + library->AddFactory<TableFactory>( + TableFactory::kPlainTableName(), + [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new PlainTableFactory()); + return guard->get(); + }); + library->AddFactory<TableFactory>( + TableFactory::kCuckooTableName(), + [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard, + std::string* /* errmsg */) { + guard->reset(new CuckooTableFactory()); + return guard->get(); + }); + }); +#endif // ROCKSDB_LITE +} + +static bool LoadFactory(const std::string& name, + std::shared_ptr<TableFactory>* factory) { + if (name == TableFactory::kBlockBasedTableName()) { + factory->reset(new BlockBasedTableFactory()); + return true; + } else { + return false; + } +} + +Status TableFactory::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr<TableFactory>* factory) { + RegisterTableFactories(""); + return LoadSharedObject<TableFactory>(config_options, value, LoadFactory, + factory); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc new file mode 100644 index 000000000..b382281f8 --- /dev/null +++ b/src/rocksdb/table/table_properties.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
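RegisterTableFactories above populates the default ObjectLibrary exactly once, so TableFactory::CreateFromString can resolve the built-in factories by name. A minimal sketch of calling it; the wrapper function is illustrative:

#include <memory>
#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

// Resolves the block-based table factory through the registry set up above.
rocksdb::Status MakeBlockBasedFactory(
    std::shared_ptr<rocksdb::TableFactory>* factory) {
  rocksdb::ConfigOptions config_options;
  return rocksdb::TableFactory::CreateFromString(
      config_options, rocksdb::TableFactory::kBlockBasedTableName(), factory);
}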
+ +#include "rocksdb/table_properties.h" + +#include "db/seqno_to_time_mapping.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/unique_id.h" +#include "table/table_properties_internal.h" +#include "table/unique_id_impl.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily = + std::numeric_limits<int32_t>::max(); + +namespace { +void AppendProperty(std::string& props, const std::string& key, + const std::string& value, const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); +} + +template <class TValue> +void AppendProperty(std::string& props, const std::string& key, + const TValue& value, const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty(props, key, std::to_string(value), prop_delim, kv_delim); +} +} // namespace + +std::string TableProperties::ToString(const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim); + AppendProperty(result, "# merge operands", num_merge_operands, prop_delim, + kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + char index_block_size_str[80]; + snprintf(index_block_size_str, sizeof(index_block_size_str), + "index block size (user-key? %d, delta-value? %d)", + static_cast<int>(index_key_is_user_key), + static_cast<int>(index_value_is_delta_encoded)); + AppendProperty(result, index_block_size_str, index_size, prop_delim, + kv_delim); + if (index_partitions != 0) { + AppendProperty(result, "# index partitions", index_partitions, prop_delim, + kv_delim); + AppendProperty(result, "top-level index size", top_level_index_size, + prop_delim, kv_delim); + } + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); + + AppendProperty( + result, "filter policy name", + filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, + prop_delim, kv_delim); + + AppendProperty(result, "prefix extractor name", + prefix_extractor_name.empty() ? std::string("N/A") + : prefix_extractor_name, + prop_delim, kv_delim); + + AppendProperty(result, "column family ID", + column_family_id == + ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily + ? 
std::string("N/A") + : std::to_string(column_family_id), + prop_delim, kv_delim); + AppendProperty( + result, "column family name", + column_family_name.empty() ? std::string("N/A") : column_family_name, + prop_delim, kv_delim); + + AppendProperty(result, "comparator name", + comparator_name.empty() ? std::string("N/A") : comparator_name, + prop_delim, kv_delim); + + AppendProperty( + result, "merge operator name", + merge_operator_name.empty() ? std::string("N/A") : merge_operator_name, + prop_delim, kv_delim); + + AppendProperty(result, "property collectors names", + property_collectors_names.empty() ? std::string("N/A") + : property_collectors_names, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression algo", + compression_name.empty() ? std::string("N/A") : compression_name, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression options", + compression_options.empty() ? std::string("N/A") : compression_options, + prop_delim, kv_delim); + + AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim); + + AppendProperty(result, "time stamp of earliest key", oldest_key_time, + prop_delim, kv_delim); + + AppendProperty(result, "file creation time", file_creation_time, prop_delim, + kv_delim); + + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + + // DB identity and DB session ID + AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); + AppendProperty(result, "DB session identity", db_session_id, prop_delim, + kv_delim); + AppendProperty(result, "DB host id", db_host_id, prop_delim, kv_delim); + AppendProperty(result, "original file number", orig_file_number, prop_delim, + kv_delim); + + // Unique ID, when available + std::string id; + Status s = GetUniqueIdFromTableProperties(*this, &id); + AppendProperty(result, "unique ID", + s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim, + kv_delim); + + SeqnoToTimeMapping seq_time_mapping; + s = seq_time_mapping.Add(seqno_to_time_mapping); + AppendProperty(result, "Sequence number to time mapping", + s.ok() ? 
seq_time_mapping.ToHumanString() : "N/A", prop_delim, + kv_delim); + + return result; +} + +void TableProperties::Add(const TableProperties& tp) { + data_size += tp.data_size; + index_size += tp.index_size; + index_partitions += tp.index_partitions; + top_level_index_size += tp.top_level_index_size; + index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; + filter_size += tp.filter_size; + raw_key_size += tp.raw_key_size; + raw_value_size += tp.raw_value_size; + num_data_blocks += tp.num_data_blocks; + num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; + num_deletions += tp.num_deletions; + num_merge_operands += tp.num_merge_operands; + num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; +} + +std::map<std::string, uint64_t> +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map<std::string, uint64_t> rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; + rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; +} + +// WARNING: manual update to this function is needed +// whenever a new string property is added to TableProperties +// to reduce approximation error. 
+// +// TODO: eliminate the need of manually updating this function +// for new string properties +std::size_t TableProperties::ApproximateMemoryUsage() const { + std::size_t usage = 0; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + std::size_t string_props_mem_usage = + db_id.size() + db_session_id.size() + db_host_id.size() + + column_family_name.size() + filter_policy_name.size() + + comparator_name.size() + merge_operator_name.size() + + prefix_extractor_name.size() + property_collectors_names.size() + + compression_name.size() + compression_options.size(); + usage += string_props_mem_usage; + + for (auto iter = user_collected_properties.begin(); + iter != user_collected_properties.end(); ++iter) { + usage += (iter->first.size() + iter->second.size()); + } + + return usage; +} + +const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; +const std::string TablePropertiesNames::kDbSessionId = + "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; +const std::string TablePropertiesNames::kOriginalFileNumber = + "rocksdb.original.file.number"; +const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = "rocksdb.index.size"; +const std::string TablePropertiesNames::kIndexPartitions = + "rocksdb.index.partitions"; +const std::string TablePropertiesNames::kTopLevelIndexSize = + "rocksdb.top-level.index.size"; +const std::string TablePropertiesNames::kIndexKeyIsUserKey = + "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; +const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; +const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string TablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; +const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; +const std::string TablePropertiesNames::kColumnFamilyId = + "rocksdb.column.family.id"; +const std::string TablePropertiesNames::kColumnFamilyName = + "rocksdb.column.family.name"; +const std::string TablePropertiesNames::kComparator = "rocksdb.comparator"; +const std::string TablePropertiesNames::kMergeOperator = + "rocksdb.merge.operator"; +const std::string TablePropertiesNames::kPrefixExtractorName = + "rocksdb.prefix.extractor.name"; +const std::string TablePropertiesNames::kPropertyCollectors = + "rocksdb.property.collectors"; +const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; +const std::string TablePropertiesNames::kCompressionOptions = + "rocksdb.compression_options"; +const 
std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time"; +const std::string TablePropertiesNames::kOldestKeyTime = + "rocksdb.oldest.key.time"; +const std::string TablePropertiesNames::kFileCreationTime = + "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; +const std::string TablePropertiesNames::kSequenceNumberTimeMapping = + "rocksdb.seqno.time.map"; + +#ifndef NDEBUG +// WARNING: TEST_SetRandomTableProperties assumes the following layout of +// TableProperties +// +// struct TableProperties { +// int64_t orig_file_number = 0; +// ... +// ... int64_t properties only +// ... +// std::string db_id; +// ... +// ... std::string properties only +// ... +// std::string compression_options; +// UserCollectedProperties user_collected_properties; +// ... +// ... Other extra properties: non-int64_t/non-std::string properties only +// ... +// } +void TEST_SetRandomTableProperties(TableProperties* props) { + Random* r = Random::GetTLSInstance(); + uint64_t* pu = &props->orig_file_number; + assert(static_cast<void*>(pu) == static_cast<void*>(props)); + std::string* ps = &props->db_id; + const uint64_t* const pu_end = reinterpret_cast<const uint64_t*>(ps); + // Use the last string property's address instead of + // the first extra property (e.g `user_collected_properties`)'s address + // in the for-loop to avoid advancing pointer to pointing to + // potential non-zero padding bytes between these two addresses due to + // user_collected_properties's alignment requirement + const std::string* const ps_end_inclusive = &props->compression_options; + + for (; pu < pu_end; ++pu) { + *pu = r->Next64(); + } + assert(static_cast<void*>(pu) == static_cast<void*>(ps)); + for (; ps <= ps_end_inclusive; ++ps) { + *ps = r->RandomBinaryString(13); + } +} +#endif + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_properties_internal.h b/src/rocksdb/table/table_properties_internal.h new file mode 100644 index 000000000..5c2a0cb9a --- /dev/null +++ b/src/rocksdb/table/table_properties_internal.h @@ -0,0 +1,14 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props); +#endif +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_reader.h b/src/rocksdb/table/table_reader.h new file mode 100644 index 000000000..391072eec --- /dev/null +++ b/src/rocksdb/table/table_reader.h @@ -0,0 +1,184 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
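TableProperties::ToString() above takes the property delimiter and the key/value delimiter as arguments, which makes it easy to pretty-print the properties of live SST files. An illustrative sketch using the public DB::GetPropertiesOfAllTables API; the formatting choices are arbitrary:

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

// Prints every live SST file's properties, one block per file.
void DumpAllTableProperties(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection all_props;
  rocksdb::Status s = db->GetPropertiesOfAllTables(&all_props);
  if (!s.ok()) {
    return;
  }
  for (const auto& file_and_props : all_props) {
    std::cout << file_and_props.first << ":\n  "
              << file_and_props.second->ToString("\n  ", " = ") << "\n";
  }
}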
+ +#pragma once +#include <memory> + +#include "db/range_tombstone_fragmenter.h" +#if USE_COROUTINES +#include "folly/experimental/coro/Coroutine.h" +#include "folly/experimental/coro/Task.h" +#endif +#include "rocksdb/slice_transform.h" +#include "rocksdb/table_reader_caller.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/multiget_context.h" + +namespace ROCKSDB_NAMESPACE { + +class Iterator; +struct ParsedInternalKey; +class Slice; +class Arena; +struct ReadOptions; +struct TableProperties; +class GetContext; +class MultiGetContext; + +// A Table (also referred to as SST) is a sorted map from strings to strings. +// Tables are immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. Table readers are used +// for reading various types of table formats supported by rocksdb including +// BlockBasedTable, PlainTable and CuckooTable format. +class TableReader { + public: + virtual ~TableReader() {} + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // + // read_options: Must outlive the returned iterator. + // arena: If not null, the arena needs to be used to allocate the Iterator. + // When destroying the iterator, the caller will not call "delete" + // but Iterator::~Iterator() directly. The destructor needs to destroy + // all the states but those allocated in arena. + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. + // compaction_readahead_size: its value will only be used if caller = + // kCompaction + virtual InternalIterator* NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) = 0; + + virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& /*read_options*/) { + return nullptr; + } + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + // TODO(peterd): Since this function is only used for approximate size + // from beginning of file, reduce code duplication by removing this + // function and letting ApproximateSize take optional start and end, so + // that absolute start and end can be specified and optimized without + // key / index work. + virtual uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) = 0; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data and applicable + // portions of metadata including filters and indexes. Nullptr for start or + // end (or both) indicates absolute start or end of the table. 
+ virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) = 0; + + struct Anchor { + Anchor(const Slice& _user_key, size_t _range_size) + : user_key(_user_key.ToStringView()), range_size(_range_size) {} + std::string user_key; + size_t range_size; + }; + + // Now try to return approximately 128 anchor keys. + // The last one tends to be the largest key. + virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/, + std::vector<Anchor>& /*anchors*/) { + return Status::NotSupported("ApproximateKeyAnchors() not supported."); + } + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + virtual void SetupForCompaction() = 0; + + virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0; + + // Prepare work that can be done before the real Get() + virtual void Prepare(const Slice& /*target*/) {} + + // Report an approximation of how much memory has been used. + virtual size_t ApproximateMemoryUsage() const = 0; + + // Calls get_context->SaveValue() repeatedly, starting with + // the entry found after a call to Seek(key), until it returns false. + // May not make such a call if filter policy says that key is not present. + // + // get_context->MarkKeyMayExist needs to be called when it is configured to be + // memory only and the key is not found in the block cache. + // + // readOptions is the options for the read + // key is the key to search for + // skip_filters: disables checking the bloom filters even if they exist. This + // option is effective only for block-based table format. + virtual Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters = false) = 0; + + // Use bloom filters in the table file, if present, to filter out keys. The + // mget_range will be updated to skip keys that get a negative result from + // the filter lookup. + virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/, + const SliceTransform* /*prefix_extractor*/, + MultiGetContext::Range* /*mget_range*/) { + return Status::NotSupported(); + } + + virtual void MultiGet(const ReadOptions& readOptions, + const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters = false) { + for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { + *iter->s = Get(readOptions, iter->ikey, iter->get_context, + prefix_extractor, skip_filters); + } + } + +#if USE_COROUTINES + virtual folly::coro::Task<void> MultiGetCoroutine( + const ReadOptions& readOptions, const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, bool skip_filters = false) { + MultiGet(readOptions, mget_range, prefix_extractor, skip_filters); + co_return; + } +#endif // USE_COROUTINES + + // Prefetch data corresponding to a give range of keys + // Typically this functionality is required for table implementations that + // persists the data on a non volatile storage medium like disk/SSD + virtual Status Prefetch(const Slice* begin = nullptr, + const Slice* end = nullptr) { + (void)begin; + (void)end; + // Default implementation is NOOP. 
+ // The child class should implement functionality when applicable + return Status::OK(); + } + + // convert db file to a human readable form + virtual Status DumpTable(WritableFile* /*out_file*/) { + return Status::NotSupported("DumpTable() not supported"); + } + + // check whether there is corruption in this db file + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, + TableReaderCaller /*caller*/) { + return Status::NotSupported("VerifyChecksum() not supported"); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc new file mode 100644 index 000000000..b13caf68d --- /dev/null +++ b/src/rocksdb/table/table_reader_bench.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "file/random_access_file_reader.h" +#include "monitoring/histogram.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +namespace ROCKSDB_NAMESPACE { + +namespace { +// Make a key that i determines the first 4 characters and j determines the +// last 4 characters. +static std::string MakeKey(int i, int j, bool through_db) { + char buf[100]; + snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j); + if (through_db) { + return std::string(buf); + } + // If we directly query table, which operates on internal keys + // instead of user keys, we need to add 8 bytes of internal + // information (row type etc) to user key to make an internal + // key. + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) { + return measured_by_nanosecond ? clock->NowNanos() : clock->NowMicros(); +} +} // namespace + +// A very simple benchmark that. +// Create a table with roughly numKey1 * numKey2 keys, +// where there are numKey1 prefixes of the key, each has numKey2 number of +// distinguished key, differing in the suffix part. +// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2 +// times randomly. +// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys. +// Print out the total time. +// If through_db=true, a full DB will be created and queries will be against +// it. Otherwise, operations will be directly through table level. +// +// If for_terator=true, instead of just query one key each time, it queries +// a range sharing the same prefix. 
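MakeKey above appends the 8-byte internal-key trailer by going through InternalKey; the trailer packs the sequence number and value type, which is why querying the table reader directly needs internal keys while querying through the DB does not. A standalone sketch of the same conversion; the helper name is illustrative:

#include <string>
#include "db/dbformat.h"

// Converts a user key into the internal-key form expected by
// TableReader/TableBuilder: user_key + 8-byte (sequence, type) trailer.
std::string ToInternalKey(const std::string& user_key,
                          rocksdb::SequenceNumber seq = 0) {
  rocksdb::InternalKey ikey(user_key, seq, rocksdb::kTypeValue);
  return ikey.Encode().ToString();
}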
+namespace { +void TableReaderBenchmark(Options& opts, EnvOptions& env_options, + ReadOptions& read_options, int num_keys1, + int num_keys2, int num_iter, int /*prefix_len*/, + bool if_query_empty_keys, bool for_iterator, + bool through_db, bool measured_by_nanosecond) { + ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); + + std::string file_name = + test::PerThreadDBPath("rocksdb_table_reader_benchmark"); + std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); + WriteOptions wo; + Env* env = Env::Default(); + auto* clock = env->GetSystemClock().get(); + TableBuilder* tb = nullptr; + DB* db = nullptr; + Status s; + const ImmutableOptions ioptions(opts); + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + std::unique_ptr<WritableFileWriter> file_writer; + if (!through_db) { + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name, + FileOptions(env_options), &file_writer, + nullptr)); + + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + int unknown_level = -1; + tb = opts.table_factory->NewTableBuilder( + TableBuilderOptions( + ioptions, moptions, ikc, &int_tbl_prop_collector_factories, + CompressionType::kNoCompression, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + file_writer.get()); + } else { + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + } + // Populate slightly more than 1M keys + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + std::string key = MakeKey(i * 2, j, through_db); + if (!through_db) { + tb->Add(key, key); + } else { + db->Put(wo, key, key); + } + } + } + if (!through_db) { + tb->Finish(); + file_writer->Close(); + } else { + db->Flush(FlushOptions()); + } + + std::unique_ptr<TableReader> table_reader; + if (!through_db) { + const auto& fs = env->GetFileSystem(); + FileOptions fopts(env_options); + + std::unique_ptr<FSRandomAccessFile> raf; + s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr); + if (!s.ok()) { + fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); + exit(1); + } + uint64_t file_size; + fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(raf), file_name)); + s = opts.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, env_options, + ikc), + std::move(file_reader), file_size, &table_reader); + if (!s.ok()) { + fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + Random rnd(301); + std::string result; + HistogramImpl hist; + + for (int it = 0; it < num_iter; it++) { + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + int r1 = rnd.Uniform(num_keys1) * 2; + int r2 = rnd.Uniform(num_keys2); + if (if_query_empty_keys) { + r1++; + r2 = num_keys2 * 2 - r2; + } + + if (!for_iterator) { + // Query one existing key; + std::string key = MakeKey(r1, r2, through_db); + uint64_t start_time = Now(clock, measured_by_nanosecond); + if (!through_db) { + PinnableSlice value; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + GetContext get_context( + ioptions.user_comparator, ioptions.merge_operator.get(), + ioptions.logger, ioptions.stats, GetContext::kNotFound, + Slice(key), &value, /*columns=*/nullptr, /*timestamp=*/nullptr, + &merge_context, true, &max_covering_tombstone_seq, clock); + s = 
table_reader->Get(read_options, key, &get_context, nullptr); + } else { + s = db->Get(read_options, key, &result); + } + hist.Add(Now(clock, measured_by_nanosecond) - start_time); + } else { + int r2_len; + if (if_query_empty_keys) { + r2_len = 0; + } else { + r2_len = rnd.Uniform(num_keys2) + 1; + if (r2_len + r2 > num_keys2) { + r2_len = num_keys2 - r2; + } + } + std::string start_key = MakeKey(r1, r2, through_db); + std::string end_key = MakeKey(r1, r2 + r2_len, through_db); + uint64_t total_time = 0; + uint64_t start_time = Now(clock, measured_by_nanosecond); + Iterator* iter = nullptr; + InternalIterator* iiter = nullptr; + if (!through_db) { + iiter = table_reader->NewIterator( + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); + } else { + iter = db->NewIterator(read_options); + } + int count = 0; + for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key); + through_db ? iter->Valid() : iiter->Valid(); + through_db ? iter->Next() : iiter->Next()) { + if (if_query_empty_keys) { + break; + } + // verify key; + total_time += Now(clock, measured_by_nanosecond) - start_time; + assert(Slice(MakeKey(r1, r2 + count, through_db)) == + (through_db ? iter->key() : iiter->key())); + start_time = Now(clock, measured_by_nanosecond); + if (++count >= r2_len) { + break; + } + } + if (count != r2_len) { + fprintf(stderr, + "Iterator cannot iterate expected number of entries. " + "Expected %d but got %d\n", + r2_len, count); + assert(false); + } + delete iter; + total_time += Now(clock, measured_by_nanosecond) - start_time; + hist.Add(total_time); + } + } + } + } + + fprintf( + stderr, + "===================================================" + "====================================================\n" + "InMemoryTableSimpleBenchmark: %20s num_key1: %5d " + "num_key2: %5d %10s\n" + "===================================================" + "====================================================" + "\nHistogram (unit: %s): \n%s", + opts.table_factory->Name(), num_keys1, num_keys2, + for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + measured_by_nanosecond ? "nanosecond" : "microsecond", + hist.ToString().c_str()); + if (!through_db) { + env->DeleteFile(file_name); + } else { + delete db; + db = nullptr; + DestroyDB(dbname, opts); + } +} +} // namespace +} // namespace ROCKSDB_NAMESPACE + +DEFINE_bool(query_empty, false, + "query non-existing keys instead of existing ones."); +DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); +DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); +DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); +DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); +DEFINE_bool(iterator, false, "For test iterator"); +DEFINE_bool(through_db, false, + "If enable, a DB instance will be created and the query will be " + "against DB. Otherwise, will be directly against a table reader."); +DEFINE_bool(mmap_read, true, "Whether use mmap read"); +DEFINE_string(table_factory, "block_based", + "Table factory to use: `block_based` (default), `plain_table` or " + "`cuckoo_hash`."); +DEFINE_string(time_unit, "microsecond", + "The time unit used for measuring performance. 
User can specify " + "`microsecond` (default) or `nanosecond`"); + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + std::shared_ptr<ROCKSDB_NAMESPACE::TableFactory> tf; + ROCKSDB_NAMESPACE::Options options; + if (FLAGS_prefix_len < 16) { + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len)); + } + ROCKSDB_NAMESPACE::ReadOptions ro; + ROCKSDB_NAMESPACE::EnvOptions env_options; + options.create_if_missing = true; + options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression; + + if (FLAGS_table_factory == "cuckoo_hash") { +#ifndef ROCKSDB_LITE + options.allow_mmap_reads = FLAGS_mmap_read; + env_options.use_mmap_reads = FLAGS_mmap_read; + ROCKSDB_NAMESPACE::CuckooTableOptions table_options; + table_options.hash_table_ratio = 0.75; + tf.reset(ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE + } else if (FLAGS_table_factory == "plain_table") { +#ifndef ROCKSDB_LITE + options.allow_mmap_reads = FLAGS_mmap_read; + env_options.use_mmap_reads = FLAGS_mmap_read; + + ROCKSDB_NAMESPACE::PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8; + plain_table_options.hash_table_ratio = 0.75; + + tf.reset(new ROCKSDB_NAMESPACE::PlainTableFactory(plain_table_options)); + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE + } else if (FLAGS_table_factory == "block_based") { + tf.reset(new ROCKSDB_NAMESPACE::BlockBasedTableFactory()); + } else { + fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str()); + } + + if (tf) { + // if user provides invalid options, just fall back to microsecond. + bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond"; + + options.table_factory = tf; + ROCKSDB_NAMESPACE::TableReaderBenchmark( + options, env_options, ro, FLAGS_num_keys1, FLAGS_num_keys2, FLAGS_iter, + FLAGS_prefix_len, FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db, + measured_by_nanosecond); + } else { + return 1; + } + + return 0; +} + +#endif // GFLAGS diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc new file mode 100644 index 000000000..af9c177e8 --- /dev/null +++ b/src/rocksdb/table/table_test.cc @@ -0,0 +1,5596 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
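The gflags definitions above are the whole interface of the benchmark. Assuming the usual build target name (table_reader_bench is an assumption about the build setup), an iterator run against a PlainTable-backed table rather than a full DB might look like this; every flag shown is one defined above:

  ./table_reader_bench --table_factory=plain_table --prefix_len=16 \
      --num_keys1=4096 --num_keys2=512 --iterator --time_unit=nanosecond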
+ +#include "rocksdb/table.h" + +#include <gtest/gtest.h> +#include <stddef.h> +#include <stdio.h> + +#include <algorithm> +#include <iostream> +#include <map> +#include <memory> +#include <string> +#include <unordered_set> +#include <vector> + +#include "cache/lru_cache.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "memtable/stl_wrappers.h" +#include "monitoring/statistics.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/unique_id.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/flush_block_policy.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding_lean.h" +#include "util/compression.h" +#include "util/file_checksum_helper.h" +#include "util/random.h" +#include "util/string_util.h" +#include "utilities/memory_allocators.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; + +namespace { + +const std::string kDummyValue(10000, 'o'); + +// DummyPropertiesCollector used to test BlockBasedTableProperties +class DummyPropertiesCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "DummyPropertiesCollector"; } + + Status Finish(UserCollectedProperties* /*properties*/) override { + return Status::OK(); + } + + Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override { + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } +}; + +class DummyPropertiesCollectorFactory1 + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new DummyPropertiesCollector(); + } + const char* Name() const override { + return "DummyPropertiesCollectorFactory1"; + } +}; + +class DummyPropertiesCollectorFactory2 + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* 
CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new DummyPropertiesCollector(); + } + const char* Name() const override { + return "DummyPropertiesCollectorFactory2"; + } +}; + +// Return reverse of "key". +// Used to test non-lexicographic comparators. +std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); + return rev; +} + +class ReverseKeyComparator : public Comparator { + public: + const char* Name() const override { + return "rocksdb.ReverseBytewiseComparator"; + } + + int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + void FindShortSuccessor(std::string* key) const override { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; + +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +const auto kUnknownColumnFamily = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + +} // namespace + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) + : data_(stl_wrappers::LessOfComparator(cmp)) {} + virtual ~Constructor() {} + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. 
Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) { + last_internal_comparator_ = &internal_comparator; + *kvmap = data_; + keys->clear(); + for (const auto& kv : data_) { + keys->push_back(kv.first); + } + data_.clear(); + Status s = FinishImpl(options, ioptions, moptions, table_options, + internal_comparator, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, + const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const stl_wrappers::KVMap& data) = 0; + + virtual InternalIterator* NewIterator( + const SliceTransform* prefix_extractor = nullptr) const = 0; + + virtual const stl_wrappers::KVMap& data() { return data_; } + + virtual bool IsArenaMode() const { return false; } + + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + + virtual bool AnywayDeleteIterator() const { return false; } + + protected: + const InternalKeyComparator* last_internal_comparator_; + + private: + stl_wrappers::KVMap data_; +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator : public InternalIterator { + public: + explicit KeyConvertingIterator(InternalIterator* iter, + bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + ~KeyConvertingIterator() override { + if (arena_mode_) { + iter_->~InternalIterator(); + } else { + delete iter_; + } + } + bool Valid() const override { return iter_->Valid() && status_.ok(); } + void Seek(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + void SeekForPrev(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->SeekForPrev(encoded); + } + void SeekToFirst() override { iter_->SeekToFirst(); } + void SeekToLast() override { iter_->SeekToLast(); } + void Next() override { iter_->Next(); } + void Prev() override { iter_->Prev(); } + IterBoundCheck UpperBoundCheckResult() override { + return iter_->UpperBoundCheckResult(); + } + + Slice key() const override { + assert(Valid()); + ParsedInternalKey parsed_key; + Status pik_status = + ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + status_ = pik_status; + return Slice(status_.getState()); + } + return parsed_key.user_key; + } + + Slice value() const override { return iter_->value(); } + Status status() const override { + return status_.ok() ? iter_->status() : status_; + } + + private: + mutable Status status_; + InternalIterator* iter_; + bool arena_mode_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +// `BlockConstructor` APIs always accept/return user keys. 
+class BlockConstructor : public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), comparator_(cmp), block_(nullptr) {} + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete block_; + block_ = nullptr; + BlockBuilder builder(table_options.block_restart_interval); + + for (const auto& kv : kv_map) { + // `DataBlockIter` assumes it reads only internal keys. `BlockConstructor` + // clients provide user keys, so we need to convert to internal key format + // before writing the data block. + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder.Add(encoded, kv.second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + block_ = new Block(std::move(contents)); + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + // `DataBlockIter` returns the internal keys it reads. + // `KeyConvertingIterator` converts them to user keys before they are + // exposed to the `BlockConstructor` clients. + return new KeyConvertingIterator( + block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber)); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor : public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false, + int level = -1, SequenceNumber largest_seqno = 0) + : Constructor(cmp), + largest_seqno_(largest_seqno), + convert_to_internal_key_(convert_to_internal_key), + level_(level) { + env_ = ROCKSDB_NAMESPACE::Env::Default(); + } + ~TableConstructor() override { Reset(); } + + Status FinishImpl(const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& internal_comparator, + const stl_wrappers::KVMap& kv_map) override { + Reset(); + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + std::unique_ptr<FSWritableFile> sink(new test::StringSink()); + file_writer_.reset(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); + std::unique_ptr<TableBuilder> builder; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. 
+ int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level_), + file_writer_.get())); + + for (const auto& kv : kv_map) { + if (convert_to_internal_key_) { + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, kv.second); + } else { + builder->Add(kv.first, kv.second); + } + EXPECT_OK(builder->status()); + } + Status s = builder->Finish(); + EXPECT_OK(file_writer_->Flush()); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); + + // Open the table + file_num_ = cur_file_num_++; + + return Reopen(ioptions, moptions); + } + + InternalIterator* NewIterator( + const SliceTransform* prefix_extractor) const override { + InternalIterator* iter = table_reader_->NewIterator( + read_options_, prefix_extractor, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + if (convert_to_internal_key_) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + const Slice skey = ikey.Encode(); + return table_reader_->ApproximateOffsetOf( + skey, TableReaderCaller::kUncategorized); + } + return table_reader_->ApproximateOffsetOf( + key, TableReaderCaller::kUncategorized); + } + + virtual Status Reopen(const ImmutableOptions& ioptions, + const MutableCFOptions& moptions) { + std::unique_ptr<FSRandomAccessFile> source(new test::StringSource( + TEST_GetSink()->contents(), file_num_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + return ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + *last_internal_comparator_, /*skip_filters*/ false, + /*immortal*/ false, false, level_, + &block_cache_tracer_, moptions.write_buffer_size, "", + file_num_, kNullUniqueId64x2, largest_seqno_), + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); + } + + virtual TableReader* GetTableReader() { return table_reader_.get(); } + + bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + + void ResetTableReader() { table_reader_.reset(); } + + bool ConvertToInternalKey() { return convert_to_internal_key_; } + + test::StringSink* TEST_GetSink() { + return static_cast<test::StringSink*>(file_writer_->writable_file()); + } + + BlockCacheTracer block_cache_tracer_; + + private: + void Reset() { + file_num_ = 0; + table_reader_.reset(); + file_writer_.reset(); + file_reader_.reset(); + } + + const ReadOptions read_options_; + uint64_t file_num_; + std::unique_ptr<WritableFileWriter> file_writer_; + std::unique_ptr<RandomAccessFileReader> file_reader_; + std::unique_ptr<TableReader> table_reader_; + SequenceNumber largest_seqno_; + bool convert_to_internal_key_; + int level_; + + TableConstructor(); + + static uint64_t cur_file_num_; + EnvOptions soptions; + Env* env_; +}; +uint64_t TableConstructor::cur_file_num_ = 1; + +class MemTableConstructor : 
public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb) + : Constructor(cmp), + internal_comparator_(cmp), + write_buffer_manager_(wb), + table_factory_(new SkipListFactory) { + options_.memtable_factory = table_factory_; + ImmutableOptions ioptions(options_); + memtable_ = + new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), + wb, kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + ~MemTableConstructor() override { delete memtable_->Unref(); } + Status FinishImpl(const Options&, const ImmutableOptions& ioptions, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete memtable_->Unref(); + ImmutableOptions mem_ioptions(ioptions); + memtable_ = new MemTable(internal_comparator_, mem_ioptions, + MutableCFOptions(options_), write_buffer_manager_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + int seq = 1; + for (const auto& kv : kv_map) { + Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second, + nullptr /* kv_prot_info */); + if (!s.ok()) { + return s; + } + seq++; + } + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); + } + + bool AnywayDeleteIterator() const override { return true; } + + bool IsArenaMode() const override { return true; } + + private: + mutable Arena arena_; + InternalKeyComparator internal_comparator_; + Options options_; + WriteBufferManager* write_buffer_manager_; + MemTable* memtable_; + std::shared_ptr<SkipListFactory> table_factory_; +}; + +class InternalIteratorFromIterator : public InternalIterator { + public: + explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {} + bool Valid() const override { return it_->Valid(); } + void Seek(const Slice& target) override { it_->Seek(target); } + void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); } + void SeekToFirst() override { it_->SeekToFirst(); } + void SeekToLast() override { it_->SeekToLast(); } + void Next() override { it_->Next(); } + void Prev() override { it_->Prev(); } + Slice key() const override { return it_->key(); } + Slice value() const override { return it_->value(); } + Status status() const override { return it_->status(); } + + private: + std::unique_ptr<Iterator> it_; +}; + +class DBConstructor : public Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), comparator_(cmp) { + db_ = nullptr; + NewDB(); + } + ~DBConstructor() override { delete db_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete db_; + db_ = nullptr; + NewDB(); + for (const auto& kv : kv_map) { + WriteBatch batch; + EXPECT_OK(batch.Put(kv.first, kv.second)); + EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions())); + } + + DB* db() const override { return db_; } + + private: + void NewDB() { + 
std::string name = test::PerThreadDBPath("table_testdb"); + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +enum TestType { + BLOCK_BASED_TABLE_TEST, +#ifndef ROCKSDB_LITE + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, +#endif // !ROCKSDB_LITE + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; + CompressionType compression; + uint32_t compression_parallel_threads; + uint32_t format_version; + bool use_mmap; +}; + +std::ostream& operator<<(std::ostream& os, const TestArgs& args) { + os << "type: " << args.type << " reverse_compare: " << args.reverse_compare + << " restart_interval: " << args.restart_interval + << " compression: " << args.compression + << " compression_parallel_threads: " << args.compression_parallel_threads + << " format_version: " << args.format_version + << " use_mmap: " << args.use_mmap; + + return os; +} + +static std::vector<TestArgs> GenerateArgList() { + std::vector<TestArgs> test_args; + std::vector<TestType> test_types = {BLOCK_BASED_TABLE_TEST, +#ifndef ROCKSDB_LITE + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, +#endif // !ROCKSDB_LITE + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST}; + std::vector<bool> reverse_compare_types = {false, true}; + std::vector<int> restart_intervals = {16, 1, 1024}; + std::vector<uint32_t> compression_parallel_threads = {1, 4}; + + // Only add compression if it is supported + std::vector<std::pair<CompressionType, bool>> compression_types; + compression_types.emplace_back(kNoCompression, false); + if (Snappy_Supported()) { + compression_types.emplace_back(kSnappyCompression, false); + } + if (Zlib_Supported()) { + compression_types.emplace_back(kZlibCompression, false); + compression_types.emplace_back(kZlibCompression, true); + } + if (BZip2_Supported()) { + compression_types.emplace_back(kBZip2Compression, false); + compression_types.emplace_back(kBZip2Compression, true); + } + if (LZ4_Supported()) { + compression_types.emplace_back(kLZ4Compression, false); + compression_types.emplace_back(kLZ4Compression, true); + compression_types.emplace_back(kLZ4HCCompression, false); + compression_types.emplace_back(kLZ4HCCompression, true); + } + if (XPRESS_Supported()) { + compression_types.emplace_back(kXpressCompression, false); + compression_types.emplace_back(kXpressCompression, true); + } + if (ZSTD_Supported()) { + compression_types.emplace_back(kZSTD, false); + compression_types.emplace_back(kZSTD, true); + } + + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { +#ifndef ROCKSDB_LITE + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + test_type == PLAIN_TABLE_FULL_STR_PREFIX || + test_type == PLAIN_TABLE_TOTAL_ORDER) { + // Plain table doesn't use restart index or compression. 
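+        // So only two variants are emitted for it per comparator: mmap reads
+        // on and off, with the first restart interval and kNoCompression as
+        // stand-ins, since neither knob applies to plain table.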
+ TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0].first; + one_arg.compression_parallel_threads = 1; + one_arg.format_version = 0; + one_arg.use_mmap = true; + test_args.push_back(one_arg); + one_arg.use_mmap = false; + test_args.push_back(one_arg); + continue; + } +#endif // !ROCKSDB_LITE + + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + for (auto num_threads : compression_parallel_threads) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type.first; + one_arg.compression_parallel_threads = num_threads; + one_arg.format_version = compression_type.second ? 2 : 1; + one_arg.use_mmap = false; + test_args.push_back(one_arg); + } + } + } + } + } + return test_args; +} + +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) + : prefix_len_(prefix_len) {} + + const char* Name() const override { return "rocksdb.FixedPrefix"; } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= prefix_len_); + } + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } +}; + +class HarnessTest : public testing::Test { + public: + explicit HarnessTest(const TestArgs& args) + : args_(args), + ioptions_(options_), + moptions_(options_), + write_buffer_(options_.db_write_buffer_size), + support_prev_(true), + only_support_prefix_seek_(false) { + options_.compression = args_.compression; + options_.compression_opts.parallel_threads = + args_.compression_parallel_threads; + // Use shorter block size for tests to exercise block boundary + // conditions more. 
+ if (args_.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + options_.allow_mmap_reads = args_.use_mmap; + switch (args_.type) { + case BLOCK_BASED_TABLE_TEST: + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + table_options_.block_size = 256; + table_options_.block_restart_interval = args_.restart_interval; + table_options_.index_block_restart_interval = args_.restart_interval; + table_options_.format_version = args_.format_version; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(NewNoopTransform()); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + + { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + + options_.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; +#endif // !ROCKSDB_LITE + case BLOCK_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_.reset(new BlockConstructor(options_.comparator)); + break; + case MEMTABLE_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_.reset( + new MemTableConstructor(options_.comparator, &write_buffer_)); + break; + case DB_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_.reset(new DBConstructor(options_.comparator)); + break; + } + ioptions_ = ImmutableOptions(options_); + moptions_ = MutableCFOptions(options_); + } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector<std::string> keys; + stl_wrappers::KVMap data; + constructor_->Finish(options_, ioptions_, moptions_, table_options_, + *internal_comparator_, &keys, &data); + + TestForwardScan(keys, data); + if (support_prev_) { + TestBackwardScan(keys, data); + } + 
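+    // Finish with a randomized mix of Seek/SeekToFirst/Next (plus Prev and
+    // SeekToLast when Prev is supported), each step cross-checked against
+    // the in-memory KVMap model.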
TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector<std::string>& /*keys*/, + const stl_wrappers::KVMap& data) { + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + for (stl_wrappers::KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + ASSERT_OK(iter->status()); + } + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + void TestBackwardScan(const std::vector<std::string>& /*keys*/, + const stl_wrappers::KVMap& data) { + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + ASSERT_OK(iter->status()); + } + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys, + const stl_wrappers::KVMap& data) { + static const bool kVerbose = false; + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + stl_wrappers::KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(support_prev_ ? 
5 : 3); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ASSERT_OK(iter->status()); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) + fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_OK(iter->status()); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + ASSERT_OK(iter->status()); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + std::string ToString(const stl_wrappers::KVMap& data, + const stl_wrappers::KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const stl_wrappers::KVMap& data, + const stl_wrappers::KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const InternalIterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(static_cast<int>(keys.size())); + std::string result = keys[index]; + switch (rnd->Uniform(support_prev_ ? 
3 : 1)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size() - 1] > '\0' && + (!only_support_prefix_seek_ || + options_.prefix_extractor->Transform(result).size() < + result.size())) { + result[result.size() - 1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns nullptr if not running against a DB + DB* db() const { return constructor_->db(); } + + private: + TestArgs args_; + Options options_; + ImmutableOptions ioptions_; + MutableCFOptions moptions_; + BlockBasedTableOptions table_options_; + std::unique_ptr<Constructor> constructor_; + WriteBufferManager write_buffer_; + bool support_prev_; + bool only_support_prefix_seek_; + std::shared_ptr<InternalKeyComparator> internal_comparator_; +}; + +class ParameterizedHarnessTest : public HarnessTest, + public testing::WithParamInterface<TestArgs> { + public: + ParameterizedHarnessTest() : HarnessTest(GetParam()) {} +}; + +INSTANTIATE_TEST_CASE_P(TableTest, ParameterizedHarnessTest, + ::testing::ValuesIn(GenerateArgList())); + +class DBHarnessTest : public HarnessTest { + public: + DBHarnessTest() + : HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false, + /* restart_interval */ 16, kNoCompression, + /* compression_parallel_threads */ 1, + /* format_version */ 0, /* use_mmap */ false}) {} +}; + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +// Tests against all kinds of tables +class TableTest : public testing::Test { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + void IndexTest(BlockBasedTableOptions table_options); + + private: + std::unique_ptr<InternalKeyComparator> plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTestBase : public TableTest {}; +class BlockBasedTableTest + : public BlockBasedTableTestBase, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + BlockBasedTableTest() : format_(GetParam()) { + env_ = ROCKSDB_NAMESPACE::Env::Default(); + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions options; + options.format_version = format_; + return options; + } + + void SetupTracingTest(TableConstructor* c) { + test_path_ = test::PerThreadDBPath("block_based_table_tracing_test"); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace_file"; + + BlockCacheTraceWriterOptions trace_writer_opt; + BlockCacheTraceOptions trace_opt; + std::unique_ptr<TraceWriter> trace_writer; + EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, + &trace_writer)); + std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer = + NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt, + std::move(trace_writer)); + ASSERT_NE(block_cache_trace_writer, nullptr); + // Always return Status::OK(). 
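+    // (StartTrace on a freshly constructed BlockCacheTracer is expected to
+    // succeed, hence the bare assert on the returned Status.)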
+ assert(c->block_cache_tracer_ + .StartTrace(trace_opt, std::move(block_cache_trace_writer)) + .ok()); + + { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + { + std::string user_key = "k02"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + } + + void VerifyBlockAccessTrace( + TableConstructor* c, + const std::vector<BlockCacheTraceRecord>& expected_records) { + c->block_cache_tracer_.EndTrace(); + + { + std::unique_ptr<TraceReader> trace_reader; + Status s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_, + &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; + } + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_FALSE(access.get_from_user_specified_snapshot); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_FALSE(access.referenced_key_exist_in_block); + } + index++; + } + EXPECT_EQ(index, expected_records.size()); + } + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + + protected: + uint64_t IndexUncompressedHelper(bool indexCompress); + + private: + uint32_t format_; + Env* env_; + std::string trace_file_path_; + std::string test_path_; +}; +class PlainTableTest : public TableTest {}; +class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; + +// The helper class to test the file checksum +class FileChecksumTestHelper { + public: + FileChecksumTestHelper(bool convert_to_internal_key = false) + : convert_to_internal_key_(convert_to_internal_key) {} + ~FileChecksumTestHelper() {} + + void CreateWritableFile() { + sink_ = new test::StringSink(); + std::unique_ptr<FSWritableFile> holder(sink_); + file_writer_.reset(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); + } + + void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) { + if (file_writer_ != nullptr) { + file_writer_->TEST_SetFileChecksumGenerator(checksum_generator); + } else { + delete checksum_generator; + } + } + + WritableFileWriter* GetFileWriter() { return 
file_writer_.get(); } + + Status ResetTableBuilder(std::unique_ptr<TableBuilder>&& builder) { + assert(builder != nullptr); + table_builder_ = std::move(builder); + return Status::OK(); + } + + void AddKVtoKVMap(int num_entries) { + Random rnd(test::RandomSeed()); + for (int i = 0; i < num_entries; i++) { + std::string v = rnd.RandomString(100); + kv_map_[test::RandomKey(&rnd, 20)] = v; + } + } + + Status WriteKVAndFlushTable() { + for (const auto& kv : kv_map_) { + if (convert_to_internal_key_) { + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + table_builder_->Add(encoded, kv.second); + } else { + table_builder_->Add(kv.first, kv.second); + } + EXPECT_TRUE(table_builder_->status().ok()); + } + Status s = table_builder_->Finish(); + EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(s); + + EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); + return s; + } + + std::string GetFileChecksum() { + EXPECT_OK(file_writer_->Close()); + return table_builder_->GetFileChecksum(); + } + + const char* GetFileChecksumFuncName() { + return table_builder_->GetFileChecksumFuncName(); + } + + Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator, + std::string* checksum) { + assert(file_checksum_generator != nullptr); + cur_file_num_ = checksum_file_num_++; + test::StringSink* ss_rw = + static_cast<test::StringSink*>(file_writer_->writable_file()); + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(ss_rw->contents())); + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr<char[]> scratch(new char[2048]); + Slice result; + uint64_t offset = 0; + Status s; + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + while (result.size() != 0) { + file_checksum_generator->Update(scratch.get(), result.size()); + offset += static_cast<uint64_t>(result.size()); + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + } + EXPECT_EQ(offset, static_cast<uint64_t>(table_builder_->FileSize())); + file_checksum_generator->Finalize(); + *checksum = file_checksum_generator->GetChecksum(); + return Status::OK(); + } + + private: + bool convert_to_internal_key_; + uint64_t cur_file_num_; + std::unique_ptr<WritableFileWriter> file_writer_; + std::unique_ptr<RandomAccessFileReader> file_reader_; + std::unique_ptr<TableBuilder> table_builder_; + stl_wrappers::KVMap kv_map_; + test::StringSink* sink_ = nullptr; + + static uint64_t checksum_file_num_; +}; + +uint64_t FileChecksumTestHelper::checksum_file_num_ = 1; + +INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +// This test serves as the living tutorial for the prefix scan of user collected +// properties. 
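+// The idiom, sketched (illustrative only; `props` and `prefix` are the
+// locals of the test body below):
+//
+//   auto pos = props.lower_bound(prefix);
+//   while (pos != props.end() &&
+//          pos->first.compare(0, prefix.size(), prefix) == 0) {
+//     // pos->first starts with `prefix`
+//     ++pos;
+//   }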
+TEST_F(TablePropertyTest, PrefixScanTest) { + UserCollectedProperties props{ + {"num.111.1", "1"}, {"num.111.2", "2"}, {"num.111.3", "3"}, + {"num.333.1", "1"}, {"num.333.2", "2"}, {"num.333.3", "3"}, + {"num.555.1", "1"}, {"num.555.2", "2"}, {"num.555.3", "3"}, + }; + + // prefixes that exist + for (const std::string prefix : {"num.111", "num.333", "num.555"}) { + int num = 0; + for (auto pos = props.lower_bound(prefix); + pos != props.end() && + pos->first.compare(0, prefix.size(), prefix) == 0; + ++pos) { + ++num; + auto key = prefix + "." + std::to_string(num); + ASSERT_EQ(key, pos->first); + ASSERT_EQ(std::to_string(num), pos->second); + } + ASSERT_EQ(3, num); + } + + // prefixes that don't exist + for (const std::string prefix : + {"num.000", "num.222", "num.444", "num.666"}) { + auto pos = props.lower_bound(prefix); + ASSERT_TRUE(pos == props.end() || + pos->first.compare(0, prefix.size(), prefix) != 0); + } +} + +namespace { +struct TestIds { + UniqueId64x3 internal_id; + UniqueId64x3 external_id; +}; + +inline bool operator==(const TestIds& lhs, const TestIds& rhs) { + return lhs.internal_id == rhs.internal_id && + lhs.external_id == rhs.external_id; +} + +std::ostream& operator<<(std::ostream& os, const TestIds& ids) { + return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x" + << ids.internal_id[1] << "U, 0x" << ids.internal_id[2] + << "U }}, {{ 0x" << ids.external_id[0] << "U, 0x" + << ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}"; +} + +TestIds GetUniqueId(TableProperties* tp, std::unordered_set<uint64_t>* seen, + const std::string& db_id, const std::string& db_session_id, + uint64_t file_number) { + // First test session id logic + if (db_session_id.size() == 20) { + uint64_t upper; + uint64_t lower; + EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower)); + EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id); + } + + // Get external using public API + tp->db_id = db_id; + tp->db_session_id = db_session_id; + tp->orig_file_number = file_number; + TestIds t; + { + std::string euid; + EXPECT_OK(GetExtendedUniqueIdFromTableProperties(*tp, &euid)); + EXPECT_EQ(euid.size(), 24U); + t.external_id[0] = DecodeFixed64(&euid[0]); + t.external_id[1] = DecodeFixed64(&euid[8]); + t.external_id[2] = DecodeFixed64(&euid[16]); + + std::string uid; + EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid)); + EXPECT_EQ(uid.size(), 16U); + EXPECT_EQ(uid, euid.substr(0, 16)); + EXPECT_EQ(t.external_id[0], DecodeFixed64(&uid[0])); + EXPECT_EQ(t.external_id[1], DecodeFixed64(&uid[8])); + } + // All these should be effectively random + EXPECT_TRUE(seen->insert(t.external_id[0]).second); + EXPECT_TRUE(seen->insert(t.external_id[1]).second); + EXPECT_TRUE(seen->insert(t.external_id[2]).second); + + // Get internal with internal API + EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, + &t.internal_id)); + EXPECT_NE(t.internal_id, kNullUniqueId64x3); + + // Verify relationship + UniqueId64x3 tmp = t.internal_id; + InternalUniqueIdToExternal(&tmp); + EXPECT_EQ(tmp, t.external_id); + ExternalUniqueIdToInternal(&tmp); + EXPECT_EQ(tmp, t.internal_id); + + // And 128-bit internal version + UniqueId64x2 tmp2{}; + EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp2)); + EXPECT_NE(tmp2, kNullUniqueId64x2); + + EXPECT_EQ(tmp2[0], t.internal_id[0]); + EXPECT_EQ(tmp2[1], t.internal_id[1]); + InternalUniqueIdToExternal(&tmp2); + EXPECT_EQ(tmp2[0], t.external_id[0]); + EXPECT_EQ(tmp2[1], t.external_id[1]); + ExternalUniqueIdToInternal(&tmp2); 
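+  // Round-tripping through the external form must reproduce the original
+  // 128-bit internal id bit for bit.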
+ EXPECT_EQ(tmp2[0], t.internal_id[0]); + EXPECT_EQ(tmp2[1], t.internal_id[1]); + + return t; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TableProperties tp; + TEST_SetRandomTableProperties(&tp); + + // DB id is normally RFC-4122 + const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + // Allow other forms of DB id + const std::string db_id2 = "1728000184588763620"; + const std::string db_id3 = "x"; + + // DB session id is normally 20 chars in base-36, but 13 to 24 chars + // is ok, roughly 64 to 128 bits. + const std::string ses_id1 = "ABCDEFGHIJ0123456789"; + // Same trailing 13 digits + const std::string ses_id2 = "HIJ0123456789"; + const std::string ses_id3 = "0123ABCDEFGHIJ0123456789"; + // Different trailing 12 digits + const std::string ses_id4 = "ABCDEFGH888888888888"; + // And change length + const std::string ses_id5 = "ABCDEFGHIJ012"; + const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD"; + + using T = TestIds; + std::unordered_set<uint64_t> seen; + // Establish a stable schema for the unique IDs. These values must not + // change for existing table files. + // (Note: parens needed for macro parsing, extra braces needed for some + // compilers.) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}})); + // Only change internal_id[1] with file number + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 2), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}}, + {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789), + T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}}, + {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}})); + // Change internal_id[1] and internal_id[2] with db_id + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id2, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}}, + {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id3, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}}, + {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}})); + // Keeping same last 13 digits of ses_id keeps same internal_id[0] + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id2, 1), + T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}}, + {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id3, 1), + T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}}, + {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}})); + // Changing last 12 digits of ses_id only changes internal_id[0] + // (vs. db_id1, ses_id1, 1) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id4, 1), + T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}})); + // ses_id can change everything. 
+ EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id5, 1), + T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}}, + {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id6, 1), + T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}}, + {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}})); + + // Now verify more thoroughly that any small change in inputs completely + // changes external unique id. + // (Relying on 'seen' checks etc. in GetUniqueId) + std::string db_id = "00000000-0000-0000-0000-000000000000"; + std::string ses_id = "000000000000000000000000"; + uint64_t file_num = 1; + // change db_id + for (size_t i = 0; i < db_id.size(); ++i) { + if (db_id[i] == '-') { + continue; + } + for (char alt : std::string("123456789abcdef")) { + db_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + db_id[i] = '0'; + } + // change ses_id + for (size_t i = 0; i < ses_id.size(); ++i) { + for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) { + ses_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + ses_id[i] = '0'; + } + // change file_num + for (int i = 1; i < 64; ++i) { + GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i); + } + + // Verify that "all zeros" in first 128 bits is equivalent for internal and + // external IDs. This way, as long as we avoid "all zeros" in internal IDs, + // we avoid it in external IDs. + { + UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}}; + UniqueId64x3 id2 = id1; + InternalUniqueIdToExternal(&id1); + EXPECT_EQ(id1, id2); + ExternalUniqueIdToInternal(&id2); + EXPECT_EQ(id1, id2); + } +} + +namespace { +void SetGoodTableProperties(TableProperties* tp) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TEST_SetRandomTableProperties(tp); + tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + tp->db_session_id = "ABCDEFGHIJ0123456789"; + tp->orig_file_number = 1; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdHumanStrings) { + TableProperties tp; + SetGoodTableProperties(&tp); + + std::string tmp; + EXPECT_OK(GetExtendedUniqueIdFromTableProperties(tp, &tmp)); + EXPECT_EQ(tmp, + (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23', + '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3', + '\x03', '\x93', '\x08', '\xca', '\x17', '\x28', + '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}})); + EXPECT_EQ(UniqueIdToHumanString(tmp), + "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B"); + + EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp)); + EXPECT_EQ(UniqueIdToHumanString(tmp), "6474DF650323BDF0-B48E64F3039308CA"); + + // including zero padding + tmp = std::string(24U, '\0'); + tmp[15] = '\x12'; + tmp[23] = '\xAB'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000000000-0000000000000012-00000000000000AB"); + + // And shortened + tmp = std::string(20U, '\0'); + tmp[5] = '\x12'; + tmp[10] = '\xAB'; + tmp[17] = '\xEF'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000120000-0000AB0000000000-00EF0000"); + + tmp.resize(16); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000"); + + tmp.resize(11); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB"); + + tmp.resize(6); + EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012"); + + // Also internal IDs to human string + UniqueId64x3 euid = {12345, 678, 9}; + EXPECT_EQ(InternalUniqueIdToHumanString(&euid), "{12345,678,9}"); + + UniqueId64x2 
uid = {1234, 567890}; + EXPECT_EQ(InternalUniqueIdToHumanString(&uid), "{1234,567890}"); +} + +TEST_F(TablePropertyTest, UniqueIdsFailure) { + TableProperties tp; + std::string tmp; + + // Missing DB id + SetGoodTableProperties(&tp); + tp.db_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + EXPECT_TRUE( + GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing session id + SetGoodTableProperties(&tp); + tp.db_session_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + EXPECT_TRUE( + GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing file number + SetGoodTableProperties(&tp); + tp.orig_file_number = 0; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + EXPECT_TRUE( + GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); +} + +// This test include all the basic checks except those for index size and block +// size, which will be conducted in separated unit tests. +TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + uint64_t diff_internal_user_bytes = 9 * 8; // 8 is seq size, 9 k-v totally + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); + + auto& props = *c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(kvmap.size(), props.num_entries); + + auto raw_key_size = kvmap.size() * 2ul; + auto raw_value_size = kvmap.size() * 4ul; + + ASSERT_EQ(raw_key_size + diff_internal_user_bytes, props.raw_key_size); + ASSERT_EQ(raw_value_size, props.raw_value_size); + ASSERT_EQ(1ul, props.num_data_blocks); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + + // Verify data size. 
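+  // The expected value is reconstructed by hand: rebuild a single data block
+  // from the same user keys with restart interval 1, then add the block
+  // trailer plus the 8 bytes of internal-key overhead per entry that the
+  // real table stores on top of the user keys.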
+ BlockBuilder block_builder(1); + for (const auto& item : kvmap) { + block_builder.Add(item.first, item.second); + } + Slice content = block_builder.Finish(); + ASSERT_EQ(content.size() + BlockBasedTable::kBlockTrailerSize + + diff_internal_user_bytes, + props.data_size); + c.ResetTableReader(); +} + +#ifdef SNAPPY +uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + constexpr size_t kNumKeys = 10000; + + for (size_t k = 0; k < kNumKeys; ++k) { + c.Add("key" + std::to_string(k), "val" + std::to_string(k)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kSnappyCompression; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + table_options.enable_index_compression = compressed; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c.ResetTableReader(); + return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +} +TEST_P(BlockBasedTableTest, IndexUncompressed) { + uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true); + uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false); + // tbl1_compressed_cnt should include 1 index block + EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt); +} +#endif // SNAPPY + +TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { + TableConstructor c(&reverse_key_comparator); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + + { + Options options; + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + // Default comparator + ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name); + // No merge operator + ASSERT_EQ("nullptr", props.merge_operator_name); + // No prefix extractor + ASSERT_EQ("nullptr", props.prefix_extractor_name); + // No property collectors + ASSERT_EQ("[]", props.property_collectors_names); + // No filter policy is used + ASSERT_EQ("", props.filter_policy_name); + // Compression type == that set: + ASSERT_EQ("NoCompression", props.compression_name); + c.ResetTableReader(); + } + + { + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.comparator = &reverse_key_comparator; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.prefix_extractor.reset(NewNoopTransform()); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory1()); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory2()); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, 
moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name); + ASSERT_EQ("UInt64AddOperator", props.merge_operator_name); + ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name); + ASSERT_EQ( + "[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]", + props.property_collectors_names); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, RangeDelBlock) { + TableConstructor c(BytewiseComparator()); + std::vector<std::string> keys = {"1pika", "2chu"}; + std::vector<std::string> vals = {"p", "c"}; + + std::vector<RangeTombstone> expected_tombstones = { + {"1pika", "2chu", 0}, + {"2chu", "c", 1}, + {"2chu", "c", 0}, + {"c", "p", 0}, + }; + + for (int i = 0; i < 2; i++) { + RangeTombstone t(keys[i], vals[i], i); + std::pair<InternalKey, Slice> p = t.Serialize(); + c.Add(p.first.Encode().ToString(), p.second); + } + + std::vector<std::string> sorted_keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + std::unique_ptr<InternalKeyComparator> internal_cmp( + new InternalKeyComparator(options.comparator)); + c.Finish(options, ioptions, moptions, table_options, *internal_cmp, + &sorted_keys, &kvmap); + + for (int j = 0; j < 2; ++j) { + std::unique_ptr<InternalIterator> iter( + c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions())); + if (j > 0) { + // For second iteration, delete the table reader object and verify the + // iterator can still access its metablock's range tombstones. 
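+      // (In other words, the returned iterator must not depend on the
+      // TableReader object staying alive.)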
+ c.ResetTableReader(); + } + ASSERT_FALSE(iter->Valid()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + for (size_t i = 0; i < expected_tombstones.size(); i++) { + ASSERT_TRUE(iter->Valid()); + ParsedInternalKey parsed_key; + ASSERT_OK( + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */)); + RangeTombstone t(parsed_key, iter->value()); + const auto& expected_t = expected_tombstones[i]; + ASSERT_EQ(t.start_key_, expected_t.start_key_); + ASSERT_EQ(t.end_key_, expected_t.end_key_); + ASSERT_EQ(t.seq_, expected_t.seq_); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + } +} + +TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("a1", "val1"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto& props = *c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(table_options.filter_policy->Name(), props.filter_policy_name); + c.ResetTableReader(); +} + +// +// BlockBasedTableTest::PrefetchTest +// +void AssertKeysInCache(BlockBasedTable* table_reader, + const std::vector<std::string>& keys_in_cache, + const std::vector<std::string>& keys_not_in_cache, + bool convert = false) { + if (convert) { + for (auto key : keys_in_cache) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + for (auto key : keys_not_in_cache) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + } else { + for (auto key : keys_in_cache) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } + for (auto key : keys_not_in_cache) { + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + } + } +} + +void PrefetchRange(TableConstructor* c, Options* opt, + BlockBasedTableOptions* table_options, const char* key_begin, + const char* key_end, + const std::vector<std::string>& keys_in_cache, + const std::vector<std::string>& keys_not_in_cache, + const Status expected_status = Status::OK()) { + // reset the cache and reopen the table + table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); + const ImmutableOptions ioptions2(*opt); + const MutableCFOptions moptions(*opt); + ASSERT_OK(c->Reopen(ioptions2, moptions)); + + // prefetch + auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader()); + Status s; + std::unique_ptr<Slice> begin, end; + std::unique_ptr<InternalKey> i_begin, i_end; + if (key_begin != nullptr) { + if (c->ConvertToInternalKey()) { + i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue)); + begin.reset(new Slice(i_begin->Encode())); + } else { + begin.reset(new Slice(key_begin)); + } + } + if (key_end != nullptr) { + if (c->ConvertToInternalKey()) { + i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue)); + end.reset(new Slice(i_end->Encode())); + } else { + end.reset(new Slice(key_end)); + } + } + s = 
table_reader->Prefetch(begin.get(), end.get()); + + ASSERT_TRUE(s.code() == expected_status.code()); + + // assert our expectation in cache warmup + AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache, + c->ConvertToInternalKey()); + c->ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, PrefetchTest) { + // The purpose of this test is to test the prefetching operation built into + // BlockBasedTable. + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + c.ResetTableReader(); + + // We get the following data spread : + // + // Data block Index + // ======================== + // [ k01 k02 k03 ] k03 + // [ k04 ] k04 + // [ k05 ] k05 + // [ k06 k07 ] k07 + + // Simple + PrefetchRange(&c, &opt, &table_options, + /*key_range=*/"k01", "k05", + /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"}, + /*keys_not_in_cache=*/{"k06", "k07"}); + PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // odd + PrefetchRange(&c, &opt, &table_options, "a", "z", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // Edge cases + PrefetchRange(&c, &opt, &table_options, "k00", "k06", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k00", "zzz", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + // null keys + PrefetchRange(&c, &opt, &table_options, nullptr, nullptr, + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k04", nullptr, + {"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"}); + PrefetchRange(&c, &opt, &table_options, nullptr, "k05", + {"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"}); + // invalid + PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {}, + Status::InvalidArgument(Slice("k06 "), Slice("k07"))); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + for (int i = 0; i <= 4; ++i) { + Options options; + // Make each key/value an individual block + table_options.block_size = 64; + switch (i) { + case 0: + // Binary search index + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 1: + // Hash search index + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + 
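+        // kHashSearch builds its index from key prefixes, so a prefix
+        // extractor is required; a fixed 4-byte prefix is used here.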
options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 2: + // Hash search index with filter policy + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 3: + // Two-level index + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 4: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + } + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("aaaa1", std::string('a', 56)); + c.Add("bbaa1", std::string('a', 56)); + c.Add("cccc1", std::string('a', 56)); + c.Add("bbbb1", std::string('a', 56)); + c.Add("baaa1", std::string('a', 56)); + c.Add("abbb1", std::string('a', 56)); + c.Add("cccc2", std::string('a', 56)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto props = c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(7u, props->num_data_blocks); + auto* reader = c.GetTableReader(); + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr<InternalIterator> iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bb", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString()); + } +} + +TEST_P(BlockBasedTableTest, NoopTransformSeek) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewNoopTransform()); + + TableConstructor c(options.comparator); + // To tickle the PrefixMayMatch bug it is important that the + // user-key is a single byte so that the index key exactly matches + // the user-key. 
+ InternalKey key("a", 1, kTypeValue); + c.Add(key.Encode().ToString(), "b"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto* reader = c.GetTableReader(); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + ro.total_order_seek = (i == 0); + std::unique_ptr<InternalIterator> iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(key.Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString()); + } +} + +TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { + // if DB is opened with a prefix extractor of a different name, + // prefix bloom is skipped when read the file + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(2)); + table_options.whole_key_filtering = false; + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + TableConstructor c(options.comparator); + InternalKey key("abcdefghijk", 1, kTypeValue); + c.Add(key.Encode().ToString(), "test"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + // TODO(Zhongyi): update test to use MutableCFOptions + options.prefix_extractor.reset(NewFixedPrefixTransform(9)); + const ImmutableOptions new_ioptions(options); + const MutableCFOptions new_moptions(options); + ASSERT_OK(c.Reopen(new_ioptions, new_moptions)); + auto reader = c.GetTableReader(); + ReadOptions read_options; + std::unique_ptr<InternalIterator> db_iter(reader->NewIterator( + read_options, new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Test point lookup + // only one kv + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } +} + +TEST_P(BlockBasedTableTest, BadChecksumType) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + InternalKey key("abc", 1, kTypeValue); + c.Add(key.Encode().ToString(), "test"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + // Corrupt checksum type (123 is invalid) + auto& sink = *c.TEST_GetSink(); + size_t len = sink.contents_.size(); + ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength], + table_options.checksum); + 
sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123}; + + // (Re-)Open table file with bad checksum type + const ImmutableOptions new_ioptions(options); + const MutableCFOptions new_moptions(options); + Status s = c.Reopen(new_ioptions, new_moptions); + ASSERT_NOK(s); + ASSERT_EQ(s.ToString(), + "Corruption: Corrupt or unsupported checksum type: 123"); +} + +namespace { +std::string ChecksumAsString(const std::string& data, + ChecksumType checksum_type) { + uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size()); + + // Verify consistency with other function + if (data.size() >= 1) { + EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte( + checksum_type, data.data(), data.size() - 1, data.back())); + } + // Little endian as in file + std::array<char, 4> raw_bytes; + EncodeFixed32(raw_bytes.data(), v); + return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true); +} + +std::string ChecksumAsString(std::string* data, char new_last_byte, + ChecksumType checksum_type) { + data->back() = new_last_byte; + return ChecksumAsString(*data, checksum_type); +} +} // namespace + +// Make sure that checksum values don't change in later versions, even if +// consistent within current version. +TEST_P(BlockBasedTableTest, ChecksumSchemas) { + std::string b0 = "x"; + std::string b1 = "This is a short block!x"; + std::string b2; + for (int i = 0; i < 100; ++i) { + b2.append("This is a long block!"); + } + b2.append("x"); + // Trailing 'x' will be replaced by compression type + + std::string empty; + + char ct1 = kNoCompression; + char ct2 = kSnappyCompression; + char ct3 = kZSTD; + + // Note: first byte of trailer is compression type, last 4 are checksum + + for (ChecksumType t : GetSupportedChecksums()) { + switch (t) { + case kNoChecksum: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000"); + break; + case kCRC32c: + EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63"); + break; + case kxxHash: + EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338"); + break; + case kxxHash64: + 
EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1"); + break; + case kXXH3: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE"); + break; + default: + // Force this test to be updated on new ChecksumTypes + assert(false); + break; + } + } +} + +void AddInternalKey(TableConstructor* c, const std::string& prefix, + std::string value = "v", int /*suffix_len*/ = 800) { + static Random rnd(1023); + InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue); + c->Add(k.Encode().ToString(), value); +} + +void TableTest::IndexTest(BlockBasedTableOptions table_options) { + TableConstructor c(BytewiseComparator()); + + // keys with prefix length 3, make sure the key/value is big enough to fill + // one block + AddInternalKey(&c, "0015"); + AddInternalKey(&c, "0035"); + + AddInternalKey(&c, "0054"); + AddInternalKey(&c, "0055"); + + AddInternalKey(&c, "0056"); + AddInternalKey(&c, "0057"); + + AddInternalKey(&c, "0058"); + AddInternalKey(&c, "0075"); + + AddInternalKey(&c, "0076"); + AddInternalKey(&c, "0095"); + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + table_options.block_size = 1700; + table_options.block_cache = NewLRUCache(1024, 4); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + auto reader = c.GetTableReader(); + + auto props = reader->GetTableProperties(); + ASSERT_EQ(5u, props->num_data_blocks); + + // TODO(Zhongyi): update test to use MutableCFOptions + ReadOptions read_options; + std::unique_ptr<InternalIterator> index_iter(reader->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // -- Find keys do not exist, but have common prefix. 
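+ // (The ten padded keys fill five ~1700-byte data blocks, matching the + // num_data_blocks assertion above; lower_bound below holds the first + // existing key that is >= each prefix, which is where Seek() should land.)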
+ std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"}; + std::vector<std::string> lower_bound = { + keys[0], keys[1], keys[2], keys[7], keys[9], + }; + + // find the lower bound of the prefix + for (size_t i = 0; i < prefixes.size(); ++i) { + index_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode()); + ASSERT_OK(index_iter->status()); + ASSERT_TRUE(index_iter->Valid()); + + // seek the first element in the block + ASSERT_EQ(lower_bound[i], index_iter->key().ToString()); + ASSERT_EQ("v", index_iter->value().ToString()); + } + + // find the upper bound of prefixes + std::vector<std::string> upper_bound = { + keys[1], + keys[2], + keys[7], + keys[9], + }; + + // find existing keys + for (const auto& item : kvmap) { + auto ukey = ExtractUserKey(item.first).ToString(); + index_iter->Seek(ukey); + + // ASSERT_OK(regular_iter->status()); + ASSERT_OK(index_iter->status()); + + // ASSERT_TRUE(regular_iter->Valid()); + ASSERT_TRUE(index_iter->Valid()); + + ASSERT_EQ(item.first, index_iter->key().ToString()); + ASSERT_EQ(item.second, index_iter->value().ToString()); + } + + for (size_t i = 0; i < prefixes.size(); ++i) { + // the key is greater than any existing key. + auto key = prefixes[i] + "9"; + index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); + + ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound()); + ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid()); + if (i == prefixes.size() - 1) { + // last key + ASSERT_TRUE(!index_iter->Valid()); + } else { + ASSERT_TRUE(index_iter->Valid()); + // seek the first element in the block + ASSERT_EQ(upper_bound[i], index_iter->key().ToString()); + ASSERT_EQ("v", index_iter->value().ToString()); + } + } + + // find keys with prefix that don't match any of the existing prefixes. + std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"}; + for (const auto& prefix : non_exist_prefixes) { + index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(index_iter->status()); + // Seek to non-existing prefixes should yield either invalid, or a + // key with prefix greater than the target. + if (index_iter->Valid()) { + Slice ukey = ExtractUserKey(index_iter->key()); + Slice ukey_prefix = options.prefix_extractor->Transform(ukey); + ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0); + } + } + for (const auto& prefix : non_exist_prefixes) { + index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(index_iter->status()); + // SeekForPrev to non-existing prefixes should yield either invalid, or a + // key with prefix smaller than the target. + if (index_iter->Valid()) { + Slice ukey = ExtractUserKey(index_iter->key()); + Slice ukey_prefix = options.prefix_extractor->Transform(ukey); + ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0); + } + } + + { + // Test reseek case. It should impact partitioned index more. + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr<InternalIterator> index_iter2(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Things to cover in partitioned index: + // 1. Both Seek() and SeekToLast() have an optimization to prevent + // reseeking the leaf index block if it remains the same one, and + // they reuse the same variable. + // 2.
When Next() or Prev() is called, the block moves, so the + // optimization should kick in only with the current one. + index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0075", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + } + + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, BinaryIndexTest) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + IndexTest(table_options); +} + +TEST_P(BlockBasedTableTest, HashIndexTest) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + IndexTest(table_options); +} + +TEST_P(BlockBasedTableTest, PartitionIndexTest) { + const int max_index_keys = 5; + const int est_max_index_key_value_size = 32; + const int est_max_index_size = max_index_keys * est_max_index_key_value_size; + for (int i = 1; i <= est_max_index_size + 1; i++) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.metadata_block_size = i; + IndexTest(table_options); + } +} + +TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableOptions 
ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + AddInternalKey(&c, "pika"); + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(1, keys.size()); + + auto reader = c.GetTableReader(); + ReadOptions ropt; + ropt.read_tier = ReadTier::kBlockCacheTier; + std::unique_ptr<InternalIterator> iter(reader->NewIterator( + ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // This used to crash at some point. + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector<int> keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "CustomFlushBlockPolicy"; } + + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector<int> keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. + AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. 
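+ // (CustomFlushBlockPolicy{2, 1, 3, 2} cuts exactly the four data blocks + // asserted below; with use_first_key, the index also stores each block's + // first key, which is what lets the seeks below defer reading data blocks + // until PrepareValue() is called.)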
+ std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(reader->NewIterator( + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. + iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. + iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. 
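+ // ("aaca" is the only key in block 1, so stepping forward crosses into + // block 2, which starts at "caaa".)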
+ iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 
4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(reader->NewIterator( + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + ASSERT_TRUE(iter->PrepareValue()); + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + +// It's very hard to figure out the index block size of a block accurately. +// To make sure we get the index size, we just make sure as key number +// grows, the filter block size also grows. +TEST_P(BlockBasedTableTest, IndexSizeStat) { + uint64_t last_index_size = 0; + + // we need to use random keys since the pure human readable texts + // may be well compressed, resulting insignifcant change of index + // block size. + Random rnd(test::RandomSeed()); + std::vector<std::string> keys; + + for (int i = 0; i < 100; ++i) { + keys.push_back(rnd.RandomString(10000)); + } + + // Each time we load one more key to the table. the table index block + // size is expected to be larger than last time's. 
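+ // (Every added key is a 10,000-byte random string, well above the default + // block size, so each one should force at least one more data block and + // therefore more index entries.)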
+ for (size_t i = 1; i < keys.size(); ++i) { + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + for (size_t j = 0; j < i; ++j) { + c.Add(keys[j], "val"); + } + + std::vector<std::string> ks; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &ks, &kvmap); + auto index_size = c.GetTableReader()->GetTableProperties()->index_size; + ASSERT_GT(index_size, last_index_size); + last_index_size = index_size; + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, NumBlockStat) { + Random rnd(test::RandomSeed()); + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + Options options; + options.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + table_options.block_size = 1000; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + for (int i = 0; i < 10; ++i) { + // the key/val are slightly smaller than block size, so that each block + // holds roughly one key/value pair. + c.Add(rnd.RandomString(900), "val"); + } + + std::vector<std::string> ks; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &ks, &kvmap); + ASSERT_EQ(kvmap.size(), + c.GetTableReader()->GetTableProperties()->num_data_blocks); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + for (uint32_t i = 1; i <= 2; i++) { + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, /*tracing_get_id=*/i); + get_perf_context()->Reset(); + ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value.ToString(), kDummyValue); + } + + // Verify traces. + std::vector<BlockCacheTraceRecord> expected_records; + // The first two records should be prefetching index and filter blocks. 
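+ // (cache_index_and_filter_blocks is enabled above, so opening the table + // warms the block cache with both blocks before any Get() runs.)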
+ BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = false; + record.no_insert = false; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for one index, one filter, and one data + // block access. + record.get_id = 1; + record.block_type = TraceType::kBlockTraceFilterBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = false; + record.referenced_key = encoded_key; + record.referenced_key_exist_in_block = true; + record.is_cache_hit = true; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceIndexBlock; + expected_records.push_back(record); + record.is_cache_hit = false; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + // The second get should all observe cache hits. + record.is_cache_hit = true; + record.get_id = 2; + record.block_type = TraceType::kBlockTraceFilterBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = false; + record.referenced_key = encoded_key; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceIndexBlock; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + for (uint32_t i = 1; i <= 2; i++) { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.GetTableReader()->ApproximateOffsetOf( + encoded_key, TableReaderCaller::kUserApproximateSize); + } + // Verify traces. + std::vector<BlockCacheTraceRecord> expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = false; + record.no_insert = false; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have two records for only index blocks. 
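+ // (ApproximateOffsetOf() only consults the index to locate the key's block, + // so no filter or data block accesses are traced for the two calls.)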
+ record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserApproximateSize; + record.is_cache_hit = true; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingIterator) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + for (uint32_t i = 1; i <= 2; i++) { + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUserIterator)); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + } + + // Verify traces. + std::vector<BlockCacheTraceRecord> expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = false; + record.no_insert = false; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for index and two data block access. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserIterator; + record.is_cache_hit = true; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + record.is_cache_hit = false; + expected_records.push_back(record); + expected_records.push_back(record); + // When we iterate this file for the second time, we should observe all cache + // hits. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.is_cache_hit = true; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +// A simple tool that takes the snapshot of block cache statistics. 
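+// The tests below snapshot the tickers after each step and assert cumulative +// per-block-type hit/miss counts rather than raw deltas.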
+class BlockCachePropertiesSnapshot { + public: + explicit BlockCachePropertiesSnapshot(Statistics* statistics) { + block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + filter_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS); + filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); + block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ); + block_cache_bytes_write = + statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE); + } + + void AssertIndexBlockStat(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit) { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + } + + void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss, + int64_t expected_filter_block_cache_hit) { + ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss); + ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit); + } + + // Check if the fetched props matches the expected ones. + // TODO(kailiu) Use this only when you disabled filter policy! + void AssertEqual(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit, + int64_t expected_data_block_cache_miss, + int64_t expected_data_block_cache_hit) const { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss); + ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit); + ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss, + block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit, + block_cache_hit); + } + + int64_t GetCacheBytesRead() { return block_cache_bytes_read; } + + int64_t GetCacheBytesWrite() { return block_cache_bytes_write; } + + private: + int64_t block_cache_miss = 0; + int64_t block_cache_hit = 0; + int64_t index_block_cache_miss = 0; + int64_t index_block_cache_hit = 0; + int64_t data_block_cache_miss = 0; + int64_t data_block_cache_hit = 0; + int64_t filter_block_cache_miss = 0; + int64_t filter_block_cache_hit = 0; + int64_t block_cache_bytes_read = 0; + int64_t block_cache_bytes_write = 0; +}; + +// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't +// use block cache to store them). 
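+// (cache_index_and_filter_blocks defaults to false, so the reader holds the +// index and filter itself and the block cache tickers below stay at zero.)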
+TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_cache = NewLRUCache(1024, 4); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("key", "value"); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // preloading filter/index blocks is enabled. + auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_FALSE(reader->TEST_IndexBlockInCache()); + + { + // nothing happens in the beginning + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertIndexBlockStat(0, 0); + props.AssertFilterBlockStat(0, 0); + } + + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), nullptr, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + // a hack just to trigger BlockBasedTable::GetFilter. + ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context, + moptions.prefix_extractor.get())); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertIndexBlockStat(0, 0); + props.AssertFilterBlockStat(0, 0); + } +} + +// Due to the difficulties of the interaction between statistics, this test +// only tests the case where the index block is put into the block cache. +TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { + // -- Table construction + Options options; + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + + // Enable the cache for index/filter blocks + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions co; + co.capacity = 2048; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_options.block_cache = NewLRUCache(co); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("key", "value"); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + // preloading filter/index blocks is prohibited. + auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_TRUE(reader->TEST_IndexBlockInCache()); + + // -- PART 1: Open with regular block cache. + // Since cache_index_and_filter_blocks is enabled, opening the table already + // pulled the index block into the block cache. + std::unique_ptr<InternalIterator> iter; + + int64_t last_cache_bytes_read = 0; + // At first, no block will be accessed. + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + // index will be added to block cache.
+ props.AssertEqual(1, // index block miss + 0, 0, 0); + ASSERT_EQ(props.GetCacheBytesRead(), 0); + ASSERT_EQ(props.GetCacheBytesWrite(), + static_cast<int64_t>(table_options.block_cache->GetUsage())); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Only index block will be accessed + { + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); + BlockCachePropertiesSnapshot props(options.statistics.get()); + // NOTE: to help better highlight the "delta" of each ticker, we use + // <last_value> + <added_value> to indicate the increment of changed + // value; other numbers remain the same. + props.AssertEqual(1, 0 + 1, // index block hit + 0, 0); + // Cache hit, bytes read from cache should increase + ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + static_cast<int64_t>(table_options.block_cache->GetUsage())); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Only data block will be accessed + { + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1, 0 + 1, // data block miss + 0); + // Cache miss; bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + static_cast<int64_t>(table_options.block_cache->GetUsage())); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Data block will be in cache + { + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1 + 1, /* index block hit */ + 1, 0 + 1 /* data block hit */); + // Cache hit, bytes read from cache should increase + ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + static_cast<int64_t>(table_options.block_cache->GetUsage())); + } + // release the iterator so that the block cache can reset correctly. + iter.reset(); + + c.ResetTableReader(); + + // -- PART 2: Open with very small block cache + // In this test, no block will ever get hit since the block cache is + // too small to fit even one entry. + table_options.block_cache = NewLRUCache(1, 4); + options.statistics = CreateDBStatistics(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + const ImmutableOptions ioptions2(options); + const MutableCFOptions moptions2(options); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, // index block miss + 0, 0, 0); + // Cache miss; bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + + { + // Both index and data block get accessed. + // It first caches the index block, then the data block. But since the + // cache capacity is only 1, the index block is purged after the data + // block is inserted. + iter.reset(c.NewIterator(moptions2.prefix_extractor.get())); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1 + 1, // index block miss + 0, 0, // data block miss + 0); + // Still only cache misses; bytes read from cache remain zero + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + + { + // SeekToFirst() accesses the data block; for the same reason, we expect a + // data block cache miss.
+ iter->SeekToFirst(); + ASSERT_OK(iter->status()); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(2, 0, 0 + 1, // data block miss + 0); + // Cache miss, Bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + iter.reset(); + c.ResetTableReader(); + + // -- PART 3: Open table with bloom filter enabled but not in SST file + table_options.block_cache = NewLRUCache(4096, 4); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c3(BytewiseComparator()); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + c3.Add(internal_key.Encode().ToString(), "hello"); + ImmutableOptions ioptions3(options); + MutableCFOptions moptions3(options); + // Generate table without filter policy + c3.Finish(options, ioptions3, moptions3, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c3.ResetTableReader(); + + // Open table with filter policy + table_options.filter_policy.reset(NewBloomFilterPolicy(1)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + ImmutableOptions ioptions4(options); + MutableCFOptions moptions4(options); + ASSERT_OK(c3.Reopen(ioptions4, moptions4)); + reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, + moptions4.prefix_extractor.get())); + ASSERT_STREQ(value.data(), "hello"); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertFilterBlockStat(0, 0); + c3.ResetTableReader(); +} + +void ValidateBlockSizeDeviation(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_size_deviation = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + factory->GetOptions<BlockBasedTableOptions>(); + ASSERT_EQ(normalized_table_options->block_size_deviation, expected); + + delete factory; +} + +void ValidateBlockRestartInterval(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_restart_interval = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + factory->GetOptions<BlockBasedTableOptions>(); + ASSERT_EQ(normalized_table_options->block_restart_interval, expected); + + delete factory; +} + +TEST_P(BlockBasedTableTest, InvalidOptions) { + // invalid values for block_size_deviation (<0 or >100) are silently set to 0 + ValidateBlockSizeDeviation(-10, 0); + ValidateBlockSizeDeviation(-1, 0); + ValidateBlockSizeDeviation(0, 0); + ValidateBlockSizeDeviation(1, 1); + ValidateBlockSizeDeviation(99, 99); + ValidateBlockSizeDeviation(100, 100); + ValidateBlockSizeDeviation(101, 0); + ValidateBlockSizeDeviation(1000, 0); + + // invalid values for block_restart_interval (<1) are silently set to 1 + ValidateBlockRestartInterval(-10, 1); + ValidateBlockRestartInterval(-1, 1); + ValidateBlockRestartInterval(0, 1); + ValidateBlockRestartInterval(1, 1); + ValidateBlockRestartInterval(2, 2); + 
ValidateBlockRestartInterval(1000, 1000); +} + +TEST_P(BlockBasedTableTest, BlockReadCountTest) { + // bloom_filter_type = 1 -- full filter using use_block_based_builder=false + // bloom_filter_type = 2 -- full filter using use_block_based_builder=true + // because of API change to hide block-based filter + for (int bloom_filter_type = 1; bloom_filter_type <= 2; ++bloom_filter_type) { + for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2; + ++index_and_filter_in_cache) { + Options options; + options.create_if_missing = true; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_cache = NewLRUCache(1, 0); + table_options.cache_index_and_filter_blocks = index_and_filter_in_cache; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, bloom_filter_type == 2)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + + TableConstructor c(BytewiseComparator()); + std::string user_key = "k04"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.Add(encoded_key, "hello"); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + // Generate table with filter policy + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto reader = c.GetTableReader(); + PinnableSlice value; + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + if (index_and_filter_in_cache) { + // data, index and filter block + ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // just the data block + ASSERT_EQ(get_perf_context()->block_read_count, 1); + } + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); + } + + // Get non-existing key + user_key = "does-not-exist"; + internal_key = InternalKey(user_key, 0, kTypeValue); + encoded_key = internal_key.Encode().ToString(); + + value.Reset(); + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + } + + if (index_and_filter_in_cache) { + if (bloom_filter_type == 0) { + // with block-based, we read index and then the filter + ASSERT_EQ(get_perf_context()->block_read_count, 2); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // with full-filter, we read filter first and then we stop + ASSERT_EQ(get_perf_context()->block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } + } else { + // filter is already in memory and it figures out that the key doesn't + // exist + ASSERT_EQ(get_perf_context()->block_read_count, 0); + } + } + } +} + +TEST_P(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to 
blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr<InternalIterator> iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + + const ImmutableOptions ioptions1(opt); + const MutableCFOptions moptions1(opt); + ASSERT_OK(c.Reopen(ioptions1, moptions1)); + auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + for (const std::string& key : keys) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + c.ResetTableReader(); + + // rerun with different block cache + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableOptions ioptions2(opt); + const MutableCFOptions moptions2(opt); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); + table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + for (const std::string& key : keys) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, MemoryAllocator) { + auto default_memory_allocator = std::make_shared<DefaultMemoryAllocator>(); + auto custom_memory_allocator = + std::make_shared<CountedMemoryAllocator>(default_memory_allocator); + { + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + LRUCacheOptions lruOptions; + lruOptions.memory_allocator = custom_memory_allocator; + lruOptions.capacity = 16 * 1024 * 1024; + lruOptions.num_shard_bits = 4; + table_options.block_cache = NewLRUCache(std::move(lruOptions)); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(opt); + const 
MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr<InternalIterator> iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + } + + // out of scope, block cache should have been deleted, all allocations + // deallocated + EXPECT_EQ(custom_memory_allocator->GetNumAllocations(), + custom_memory_allocator->GetNumDeallocations()); + // make sure that allocations actually happened through the cache allocator + EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0); +} + +// Test the file checksum of block based table +TEST_P(BlockBasedTableTest, NoFileChecksum) { + Options options; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + int level = 0; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + + FileChecksumTestHelper f(true); + f.CreateWritableFile(); + std::unique_ptr<TableBuilder> builder; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, *comparator, + &int_tbl_prop_collector_factories, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), + f.GetFileWriter())); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); + f.AddKVtoKVMap(1000); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); + ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum); +} + +TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); + Options options; + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + int level = 0; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + FileChecksumTestHelper f(true); + f.CreateWritableFile(); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); + std::unique_ptr<TableBuilder> builder; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, *comparator, + &int_tbl_prop_collector_factories, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), + f.GetFileWriter())); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); + f.AddKVtoKVMap(1000); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::string checksum; + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); + ASSERT_STREQ(f.GetFileChecksum().c_str(), 
checksum.c_str()); + + // Unit test the generator itself for schema stability + std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + const char data[] = "here is some data"; + checksum_crc32c_gen3->Update(data, sizeof(data)); + checksum_crc32c_gen3->Finalize(); + checksum = checksum_crc32c_gen3->GetChecksum(); + ASSERT_STREQ(checksum.c_str(), "\345\245\277\110"); +} + +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(PlainTableTest, BasicPlainTableProperties) { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 8; + plain_table_options.bloom_bits_per_key = 8; + plain_table_options.hash_table_ratio = 0; + + PlainTableFactory factory(plain_table_options); + std::unique_ptr<FSWritableFile> sink(new test::StringSink()); + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); + Options options; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + int unknown_level = -1; + std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), + file_writer.get())); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(file_writer->Flush()); + + test::StringSink* ss = + static_cast<test::StringSink*>(file_writer->writable_file()); + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(ss->contents(), 72242, true)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr<TableProperties> props; + auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), + kPlainTableMagicNumber, ioptions, &props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props->index_size); + ASSERT_EQ(0ul, props->filter_size); + ASSERT_EQ(16ul * 26, props->raw_key_size); + ASSERT_EQ(28ul * 26, props->raw_value_size); + ASSERT_EQ(26ul, props->num_entries); + ASSERT_EQ(1ul, props->num_data_blocks); +} + +TEST_F(PlainTableTest, NoFileChecksum) { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 20; + plain_table_options.bloom_bits_per_key = 8; + plain_table_options.hash_table_ratio = 0; + PlainTableFactory factory(plain_table_options); + + Options options; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + int unknown_level = -1; + FileChecksumTestHelper f(true); + f.CreateWritableFile(); + + std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), + f.GetFileWriter())); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); + f.AddKVtoKVMap(1000); + 
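+  // No checksum generator was configured for this file, so once the table is
+  // flushed the recorded checksum function name and checksum value should
+  // still be the "unknown" placeholders asserted below.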
ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); + EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum); +} + +TEST_F(PlainTableTest, Crc32cFileChecksum) { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 20; + plain_table_options.bloom_bits_per_key = 8; + plain_table_options.hash_table_ratio = 0; + PlainTableFactory factory(plain_table_options); + + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); + Options options; + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + int unknown_level = -1; + + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + FileChecksumTestHelper f(true); + f.CreateWritableFile(); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); + + std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), + f.GetFileWriter())); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); + f.AddKVtoKVMap(1000); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::string checksum; + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); + EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); +} + +#endif // !ROCKSDB_LITE + +TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.db_host_id = ""; + test::PlainInternalKeyComparator internal_comparator(options.comparator); + options.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + // k04 and k05 will be in two consecutive blocks, the index is + // an arbitrary slice between k04 and k05, either before or after k04a + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + 
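+  // Rough arithmetic behind the remaining bounds: k01..k03 contribute about
+  // 10000 bytes, k04 about 200000 and k05 about 300000, so k06/k07 should
+  // start near offset 510000, and a key past the end ("xyz") should map to a
+  // little over 610000 once k07's ~100000-byte value is included.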
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); + c.ResetTableReader(); +} + +static void DoCompressionTest(CompressionType comp) { + Random rnd(301); + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + test::PlainInternalKeyComparator ikc(options.comparator); + options.compression = comp; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7075)); + c.ResetTableReader(); +} + +TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { + std::vector<CompressionType> compression_state; + if (!Snappy_Supported()) { + fprintf(stderr, "skipping snappy compression tests\n"); + } else { + compression_state.push_back(kSnappyCompression); + } + + if (!Zlib_Supported()) { + fprintf(stderr, "skipping zlib compression tests\n"); + } else { + compression_state.push_back(kZlibCompression); + } + + // TODO(kailiu) DoCompressionTest() doesn't work with BZip2. + /* + if (!BZip2_Supported()) { + fprintf(stderr, "skipping bzip2 compression tests\n"); + } else { + compression_state.push_back(kBZip2Compression); + } + */ + + if (!LZ4_Supported()) { + fprintf(stderr, "skipping lz4 and lz4hc compression tests\n"); + } else { + compression_state.push_back(kLZ4Compression); + compression_state.push_back(kLZ4HCCompression); + } + + if (!XPRESS_Supported()) { + fprintf(stderr, "skipping xpress and xpress compression tests\n"); + } else { + compression_state.push_back(kXpressCompression); + } + + for (auto state : compression_state) { + DoCompressionTest(state); + } +} + +TEST_F(GeneralTableTest, ApproximateKeyAnchors) { + Random rnd(301); + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + std::string tmp; + for (int i = 1000; i < 9000; i++) { + c.Add(std::to_string(i), rnd.RandomString(2000)); + } + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + InternalKeyComparator ikc(options.comparator); + options.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 4096; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); + + std::vector<TableReader::Anchor> anchors; + ASSERT_OK(c.GetTableReader()->ApproximateKeyAnchors(ReadOptions(), anchors)); + // The target is 128 anchors. But in reality it can be slightly more or fewer. + ASSERT_GT(anchors.size(), 120); + ASSERT_LT(anchors.size(), 140); + + // We have around 8000 keys. With 128 anchors, in average 62.5 keys per + // anchor. 
Here we take a rough range and estimate the distance between + // anchors is between 50 and 100. + // Total data size is about 18,000,000, so each anchor range is about + // 140,625. We also take a rough range. + int prev_num = 1000; + // Non-last anchor + for (size_t i = 0; i + 1 < anchors.size(); i++) { + auto& anchor = anchors[i]; + ASSERT_GT(anchor.range_size, 100000); + ASSERT_LT(anchor.range_size, 200000); + + // Key might be shortened, so fill 0 in the end if it is the case. + std::string key_cpy = anchor.user_key; + key_cpy.append(4 - key_cpy.size(), '0'); + int num = std::stoi(key_cpy); + ASSERT_GT(num - prev_num, 50); + ASSERT_LT(num - prev_num, 100); + prev_num = num; + } + + ASSERT_EQ("8999", anchors.back().user_key); + ASSERT_LT(anchors.back().range_size, 200000); + + c.ResetTableReader(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) { + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 1 : 200)) { + for (int e = 0; e < num_entries; e++) { + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + rnd.RandomString(rnd.Skewed(5))); + } + Test(&rnd); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBHarnessTest, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5))); + } + Test(&rnd); + + // We must have created enough data to force merging + int files = 0; + for (int level = 0; level < db()->NumberLevels(); level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); +} +#endif // ROCKSDB_LITE +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +class MemTableTest : public testing::Test { + public: + MemTableTest() { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared<SkipListFactory>(); + options_.memtable_factory = table_factory; + ImmutableOptions ioptions(options_); + wb_ = new WriteBufferManager(options_.db_write_buffer_size); + memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + + ~MemTableTest() { + delete memtable_->Unref(); + delete wb_; + } + + MemTable* GetMemTable() { return memtable_; } + + private: + MemTable* memtable_; + Options options_; + WriteBufferManager* wb_; +}; + +TEST_F(MemTableTest, Simple) { + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + ASSERT_OK(batch.Put(std::string("k1"), std::string("v1"))); + ASSERT_OK(batch.Put(std::string("k2"), std::string("v2"))); + ASSERT_OK(batch.Put(std::string("k3"), std::string("v3"))); + ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge"))); + ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua"))); + ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end"))); + ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable()); + ASSERT_TRUE( + WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) + .ok()); + + for (int i = 0; i < 2; ++i) { + Arena arena; + ScopedArenaIterator arena_iter_guard; + std::unique_ptr<InternalIterator> iter_guard; + InternalIterator* iter; + if (i == 0) { + iter = 
GetMemTable()->NewIterator(ReadOptions(), &arena); + arena_iter_guard.set(iter); + } else { + iter = GetMemTable()->NewRangeTombstoneIterator( + ReadOptions(), kMaxSequenceNumber /* read_seq */, + false /* immutable_memtable */); + iter_guard.reset(iter); + } + if (iter == nullptr) { + continue; + } + iter->SeekToFirst(); + while (iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + } +} + +// Test the empty key +TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) { + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); +} + +TEST_P(ParameterizedHarnessTest, SimpleSingle) { + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); +} + +TEST_P(ParameterizedHarnessTest, SimpleMulti) { + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); +} + +TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) { + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); +} + +TEST(TableTest, FooterTests) { + Random* r = Random::GetTLSInstance(); + uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100); + uint64_t index_size = r->Uniform(1000000000); + uint64_t metaindex_size = r->Uniform(1000000); + // 5 == block trailer size + BlockHandle index(data_size + 5, index_size); + BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size); + uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5; + { + // legacy block based + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, + footer_offset, kCRC32c, meta_index, index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyBlockBasedTableMagicNumber); + } + // block based, various checksums, various versions + for (auto t : GetSupportedChecksums()) { + for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) { + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t, + meta_index, index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), + kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), t); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), + meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.format_version(), fv); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + } + } +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + { + // legacy plain table + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, 
/* format_version */ 0, footer_offset, + kNoChecksum, meta_index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyPlainTableMagicNumber); + } + { + // xxhash plain table (not currently used) + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset, + kxxHash, meta_index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), kxxHash); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 1U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + } +#endif // !ROCKSDB_LITE +} + +class IndexBlockRestartIntervalTest + : public TableTest, + public ::testing::WithParamInterface<std::pair<int, bool>> { + public: + static std::vector<std::pair<int, bool>> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } +}; + +INSTANTIATE_TEST_CASE_P( + IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest, + ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues())); + +TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { + const int kKeysInTable = 10000; + const int kKeySize = 100; + const int kValSize = 500; + + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); + + Options options; + BlockBasedTableOptions table_options; + table_options.block_size = 64; // small block size to get big index block + table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } else { + table_options.format_version = 3; + } + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + static Random rnd(301); + for (int i = 0; i < kKeysInTable; i++) { + InternalKey k(rnd.RandomString(kKeySize), 0, kTypeValue); + c.Add(k.Encode().ToString(), rnd.RandomString(kValSize)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + auto reader = c.GetTableReader(); + + ReadOptions read_options; + std::unique_ptr<InternalIterator> 
db_iter(reader->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Test point lookup + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } + + // Test iterating + auto kv_iter = kvmap.begin(); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + ASSERT_EQ(db_iter->key(), kv_iter->first); + ASSERT_EQ(db_iter->value(), kv_iter->second); + kv_iter++; + } + ASSERT_EQ(kv_iter, kvmap.end()); + c.ResetTableReader(); +} + +class PrefixTest : public testing::Test { + public: + PrefixTest() : testing::Test() {} + ~PrefixTest() override {} +}; + +namespace { +// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest +class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { + public: + ~TestPrefixExtractor() override{}; + const char* Name() const override { return "TestPrefixExtractor"; } + + ROCKSDB_NAMESPACE::Slice Transform( + const ROCKSDB_NAMESPACE::Slice& src) const override { + assert(IsValid(src)); + return ROCKSDB_NAMESPACE::Slice(src.data(), 3); + } + + bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { + return IsValid(src); + } + + bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { + return true; + } + + bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const { + if (src.size() != 4) { + return false; + } + if (src[0] != '[') { + return false; + } + if (src[1] < '0' || src[1] > '9') { + return false; + } + if (src[2] != ']') { + return false; + } + if (src[3] < '0' || src[3] > '9') { + return false; + } + return true; + } +}; +} // namespace + +TEST_F(PrefixTest, PrefixAndWholeKeyTest) { + ROCKSDB_NAMESPACE::Options options; + options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal; + options.num_levels = 20; + options.create_if_missing = true; + options.optimize_filters_for_hits = false; + options.target_file_size_base = 268435456; + options.prefix_extractor = std::make_shared<TestPrefixExtractor>(); + ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto; + bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10)); + bbto.block_size = 262144; + bbto.whole_key_filtering = true; + + const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(DestroyDB(kDBPath, options)); + ROCKSDB_NAMESPACE::DB* db; + ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); + + // Create a bunch of keys with 10 filters. + for (int i = 0; i < 10; i++) { + std::string prefix = "[" + std::to_string(i) + "]"; + for (int j = 0; j < 10; j++) { + std::string key = prefix + std::to_string(j); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1")); + } + } + + // Trigger compaction. + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + delete db; + // In the second round, turn whole_key_filtering off and expect + // rocksdb still works. +} + +/* + * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in + * the SST file any more. Instead, RocksDB deduces global_seqno from the + * MANIFEST while reading from an SST. Therefore, it's not possible to test the + * functionality of global_seqno in a single, isolated unit test without the + * involvement of Version, VersionSet, etc. 
+ */ +TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<FSWritableFile> holder(sink); + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + std::string column_family_name; + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), + file_writer.get())); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + std::string value = key; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(file_writer->Flush()); + + test::RandomRWStringSink ss_rw(sink); + uint32_t version; + uint64_t global_seqno; + uint64_t global_seqno_offset; + + // Helper function to get version, global_seqno, global_seqno_offset + std::function<void()> GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(ss_rw.contents(), 73342, true)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(source), "")); + + std::unique_ptr<TableProperties> props; + ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props)); + + UserCollectedProperties user_props = props->user_collected_properties; + version = DecodeFixed32( + user_props[ExternalSstFilePropertyNames::kVersion].c_str()); + global_seqno = DecodeFixed64( + user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str()); + global_seqno_offset = props->external_sst_file_global_seqno_offset; + }; + + // Helper function to update the value of the global seqno in the file + std::function<void(uint64_t)> SetGlobalSeqno = [&](uint64_t val) { + std::string new_global_seqno; + PutFixed64(&new_global_seqno, val); + + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(), + nullptr)); + }; + + // Helper function to get the contents of the table InternalIterator + std::unique_ptr<TableReader> table_reader; + const ReadOptions read_options; + std::function<InternalIterator*()> GetTableInternalIter = [&]() { + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(ss_rw.contents(), 73342, true)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(source), "")); + + options.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + ikc), + std::move(file_reader), ss_rw.contents().size(), &table_reader); + + return table_reader->NewIterator( + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); + }; + + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2u, version); + ASSERT_EQ(0u, global_seqno); + + InternalIterator* iter = GetTableInternalIter(); 
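+  // The file was written with global_seqno == 0, so the iteration below
+  // should surface every key 'a'..'z' at sequence number 0, with each value
+  // equal to its user key (that is how the builder loop above created them).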
+ char current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 0); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + delete iter; + + // Update global sequence number to 10 + SetGlobalSeqno(10); + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2u, version); + ASSERT_EQ(10u, global_seqno); + + iter = GetTableInternalIter(); + current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 10); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + + // Verify Seek + for (char c = 'a'; c <= 'z'; c++) { + std::string k = std::string(8, c); + InternalKey ik(k, 10, kValueTypeForSeek); + iter->Seek(ik.Encode()); + ASSERT_TRUE(iter->Valid()); + + ParsedInternalKey pik; + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 10); + ASSERT_EQ(pik.user_key.ToString(), k); + ASSERT_EQ(iter->value().ToString(), k); + } + delete iter; + + // Update global sequence number to 3 + SetGlobalSeqno(3); + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2u, version); + ASSERT_EQ(3u, global_seqno); + + iter = GetTableInternalIter(); + current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 3); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + + // Verify Seek + for (char c = 'a'; c <= 'z'; c++) { + std::string k = std::string(8, c); + // seqno=4 is less than 3 so we still should get our key + InternalKey ik(k, 4, kValueTypeForSeek); + iter->Seek(ik.Encode()); + ASSERT_TRUE(iter->Valid()); + + ParsedInternalKey pik; + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 3); + ASSERT_EQ(pik.user_key.ToString(), k); + ASSERT_EQ(iter->value().ToString(), k); + } + + delete iter; +} + +TEST_P(BlockBasedTableTest, BlockAlignTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<FSWritableFile> holder(sink); + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, 
kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(file_writer->Flush()); + + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(sink->contents(), 73342, false)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(source), "test")); + // Helper function to get version, global_seqno, global_seqno_offset + std::function<void()> VerifyBlockAlignment = [&]() { + std::unique_ptr<TableProperties> props; + ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props)); + + uint64_t data_block_size = props->data_size / props->num_data_blocks; + ASSERT_EQ(data_block_size, 4096); + ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); + }; + + VerifyBlockAlignment(); + + // The below block of code verifies that we can read back the keys. Set + // block_align to false when creating the reader to ensure we can flip between + // the two modes without any issues + std::unique_ptr<TableReader> table_reader; + bbto.block_align = false; + Options options2; + options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ImmutableOptions ioptions2(options2); + const MutableCFOptions moptions2(options2); + + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), + GetPlainInternalComparator(options2.comparator)), + std::move(file_reader), sink->contents().size(), &table_reader)); + + ReadOptions read_options; + std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator( + read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + int expected_key = 1; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << expected_key++; + std::string key = ostr.str(); + std::string value = "val"; + + ASSERT_OK(db_iter->status()); + ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key); + ASSERT_EQ(db_iter->value().ToString(), value); + } + expected_key--; + ASSERT_EQ(expected_key, 10000); + table_reader.reset(); +} + +TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<FSWritableFile> holder(sink); + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); + + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + std::string column_family_name; + + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + 
column_family_name, -1), + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(file_writer->Flush()); + + std::unique_ptr<FSRandomAccessFile> source( + new test::StringSource(sink->contents(), 73342, true)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(source), "test")); + + { + RandomAccessFileReader* file = file_reader.get(); + uint64_t file_size = sink->contents().size(); + + Footer footer; + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */, + file_size, &footer, + kBlockBasedTableMagicNumber)); + + auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, + BlockContents* contents) { + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, handle, + contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), cache_options); + + ASSERT_OK(block_fetcher.ReadBlockContents()); + }; + + // -- Read metaindex block + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + + BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, + &metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); + + std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber)); + + // -- Read properties block + BlockHandle properties_handle; + ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName, + &properties_handle)); + ASSERT_FALSE(properties_handle.IsNull()); + BlockContents properties_contents; + BlockFetchHelper(properties_handle, BlockType::kProperties, + &properties_contents); + Block properties_block(std::move(properties_contents)); + + ASSERT_EQ(properties_block.NumRestarts(), 1u); + } +} + +TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { + // The properties meta-block should come at the end since we always need to + // read it when opening a file, unlike index/filter/other meta-blocks, which + // are sometimes read depending on the user's configuration. This ordering + // allows us to do a small readahead on the end of the file to read properties + // and meta-index blocks with one I/O. 
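+  // The test below builds a small table with a filter, re-reads the metaindex
+  // block from the raw file contents, and verifies that the properties block
+  // is the meta block with the largest offset, i.e. it was written last.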
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr<FSRandomAccessFile> source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr<RandomAccessFileReader> table_reader( + new RandomAccessFileReader(std::move(source), "test")); + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), + nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents)); + + // verify properties block comes last + std::unique_ptr<InternalIterator> metaindex_iter{ + metaindex_block.NewMetaIterator()}; + uint64_t max_offset = 0; + std::string key_at_max_offset; + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + BlockHandle handle; + Slice value = metaindex_iter->value(); + ASSERT_OK(handle.DecodeFrom(&value)); + if (handle.offset() > max_offset) { + max_offset = handle.offset(); + key_at_max_offset = metaindex_iter->key().ToString(); + } + } + ASSERT_EQ(kPropertiesBlockName, key_at_max_offset); + // index handle is stored in footer rather than metaindex block, so need + // separate logic to verify it comes before properties block. 
+ ASSERT_GT(max_offset, footer.index_handle().offset()); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, SeekMetaBlocks) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("foo_a1", "val1"); + c.Add("foo_b2", "val2"); + c.Add("foo_c3", "val3"); + c.Add("foo_d4", "val4"); + c.Add("foo_e5", "val5"); + c.Add("foo_f6", "val6"); + c.Add("foo_g7", "val7"); + c.Add("foo_h8", "val8"); + c.Add("foo_j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr<FSRandomAccessFile> source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr<RandomAccessFileReader> table_reader( + new RandomAccessFileReader(std::move(source), "test")); + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), + nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents)); + + // verify properties block comes last + std::unique_ptr<MetaBlockIter> metaindex_iter( + metaindex_block.NewMetaIterator()); + bool has_hash_prefixes = false; + bool has_hash_metadata = false; + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) { + has_hash_prefixes = true; + } else if (metaindex_iter->key().ToString() == + kHashIndexPrefixesMetadataBlock) { + has_hash_metadata = true; + } + } + if (has_hash_metadata) { + metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesMetadataBlock, + metaindex_iter->key().ToString()); + } + if (has_hash_prefixes) { + metaindex_iter->Seek(kHashIndexPrefixesBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString()); + } + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, BadOptions) { + ROCKSDB_NAMESPACE::Options options; + options.compression = kNoCompression; + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_size = 4000; + bbto.block_align = true; + + const std::string kDBPath = + 
      test::PerThreadDBPath("block_based_table_bad_options_test");
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(DestroyDB(kDBPath, options));
+  ROCKSDB_NAMESPACE::DB* db;
+  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+
+  bbto.block_size = 4096;
+  options.compression = kSnappyCompression;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+}
+
+TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) {
+  TailPrefetchStats tpstats;
+  ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize());
+  tpstats.RecordEffectiveSize(size_t{1000});
+  tpstats.RecordEffectiveSize(size_t{1005});
+  tpstats.RecordEffectiveSize(size_t{1002});
+  ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize());
+
+  // One single super large value shouldn't influence much
+  tpstats.RecordEffectiveSize(size_t{1002000});
+  tpstats.RecordEffectiveSize(size_t{999});
+  ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize());
+  ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize());
+
+  // Only a history of 32 entries is kept
+  for (int i = 0; i < 32; i++) {
+    tpstats.RecordEffectiveSize(size_t{100});
+  }
+  ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize());
+
+  // 16 large values and 16 small values in the kept history. The suggested
+  // size should end up closer to the small values, as the algorithm is
+  // biased toward the smaller recorded sizes.
+  for (int i = 0; i < 16; i++) {
+    tpstats.RecordEffectiveSize(size_t{1000});
+  }
+  tpstats.RecordEffectiveSize(size_t{10});
+  tpstats.RecordEffectiveSize(size_t{20});
+  for (int i = 0; i < 6; i++) {
+    tpstats.RecordEffectiveSize(size_t{100});
+  }
+  ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize());
+  ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize());
+}
+
+TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
+  TailPrefetchStats tpstats;
+  FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */,
+                            false /* enable */, true /* track_min_offset */);
+  IOOptions opts;
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  ASSERT_EQ(480, buffer.min_offset_read());
+}
+
+TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
+  const int kNumKeys = 500;
+  const int kKeySize = 8;
+  const int kValSize = 40;
+
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  TableConstructor c(options.comparator);
+
+  static Random rnd(1048);
+  for (int i = 0; i < kNumKeys; i++) {
+    // append one "1" to mark existent keys.
+    std::string random_key(rnd.RandomString(kKeySize - 1) + "1");
+    InternalKey k(random_key, 0, kTypeValue);
+    c.Add(k.Encode().ToString(), rnd.RandomString(kValSize));
+  }
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  auto reader = c.GetTableReader();
+
+  std::unique_ptr<InternalIterator> seek_iter;
+  ReadOptions read_options;
+  seek_iter.reset(reader->NewIterator(
+      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  for (int i = 0; i < 2; ++i) {
+    ReadOptions ro;
+    // For every kv we look it up with two methods: Get() and Seek().
+    // Get() will use the SuffixIndexHash in Block. For a non-existent key it
+    // will invalidate the iterator.
+    // Seek() will use the default BinarySeek() in Block. So for a non-existent
+    // key it will land at the closest key that is larger than the target.
+
+    // Search for existent keys
+    for (auto& kv : kvmap) {
+      if (i == 0) {
+        // Search using Seek()
+        seek_iter->Seek(kv.first);
+        ASSERT_OK(seek_iter->status());
+        ASSERT_TRUE(seek_iter->Valid());
+        ASSERT_EQ(seek_iter->key(), kv.first);
+        ASSERT_EQ(seek_iter->value(), kv.second);
+      } else {
+        // Search using Get()
+        PinnableSlice value;
+        std::string user_key = ExtractUserKey(kv.first).ToString();
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        ASSERT_OK(reader->Get(ro, kv.first, &get_context,
+                              moptions.prefix_extractor.get()));
+        ASSERT_EQ(get_context.State(), GetContext::kFound);
+        ASSERT_EQ(value, Slice(kv.second));
+        value.Reset();
+      }
+    }
+
+    // Search for non-existent keys
+    for (auto& kv : kvmap) {
+      std::string user_key = ExtractUserKey(kv.first).ToString();
+      user_key.back() = '0';  // make it a non-existent key
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      if (i == 0) {  // Search using Seek()
+        seek_iter->Seek(encoded_key);
+        ASSERT_OK(seek_iter->status());
+        if (seek_iter->Valid()) {
+          ASSERT_TRUE(BytewiseComparator()->Compare(
+                          user_key, ExtractUserKey(seek_iter->key())) < 0);
+        }
+      } else {  // Search using Get()
+        PinnableSlice value;
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
+                              moptions.prefix_extractor.get()));
+        ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+        value.Reset();
+      }
+    }
+  }
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// OutOfBound()=true immediately after Seek(), to allow LevelIterator to
+// filter out the corresponding level.
+TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) {
+  TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+  c.Add("foo", "v1");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+  options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_opt,
+           GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+  auto* reader = c.GetTableReader();
+  ReadOptions read_opt;
+  std::string upper_bound = "bar";
+  Slice upper_bound_slice(upper_bound);
+  read_opt.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->SeekToFirst();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("foo");
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// OutOfBound()=true after Next() if it finds that the current index key is
+// no smaller than the upper bound, unless it is pointing to the last data
+// block.
+TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
+  TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+  c.Add("bar", "v");
+  c.Add("foo", "v");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+  table_opt.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_opt,
+           GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+  auto* reader = c.GetTableReader();
+  ReadOptions read_opt;
+  std::string ub1 = "bar_after";
+  Slice ub_slice1(ub1);
+  read_opt.iterate_upper_bound = &ub_slice1;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+  std::string ub2 = "foo_after";
+  Slice ub_slice2(ub2);
+  read_opt.iterate_upper_bound = &ub_slice2;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+}
+
+class ChargeCompressionDictionaryBuildingBufferTest
+    : public BlockBasedTableTestBase {};
+
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  constexpr std::size_t kMaxDictBufferBytes = 1024;
+
+  for (CacheEntryRoleOptions::Decision
+           charge_compression_dictionary_building_buffer :
+       {CacheEntryRoleOptions::Decision::kEnabled,
+        CacheEntryRoleOptions::Decision::kDisabled}) {
+    BlockBasedTableOptions table_options;
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    std::shared_ptr<Cache> cache(NewLRUCache(lo));
+    table_options.block_cache = cache;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+         {/*.charged = */ charge_compression_dictionary_building_buffer}});
+    Options options;
+    options.compression = kSnappyCompression;
+    options.compression_opts.max_dict_bytes = kMaxDictBytes;
+    options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    test::StringSink* sink = new test::StringSink();
+    std::unique_ptr<FSWritableFile> holder(sink);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(holder), "test_file_name", FileOptions()));
+
+    ImmutableOptions ioptions(options);
+    MutableCFOptions moptions(options);
+    InternalKeyComparator ikc(options.comparator);
+    IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+    std::unique_ptr<TableBuilder> builder(
+        options.table_factory->NewTableBuilder(
+            TableBuilderOptions(
+                ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+                kSnappyCompression, options.compression_opts,
+                kUnknownColumnFamily, "test_cf", -1 /* level */),
+            file_writer.get()));
+
+    std::string key1 = "key1";
+    std::string value1 = "val1";
+    InternalKey ik1(key1, 0 /* sequence number */, kTypeValue);
+    // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy,
+    // and therefore won't trigger any data block buffering.
+    builder->Add(ik1.Encode(), value1);
+    ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+    std::string key2 = "key2";
+    std::string value2 = "val2";
+    InternalKey ik2(key2, 1 /* sequence number */, kTypeValue);
+    // Adding the second key will trigger a flush of the last data block (the
+    // one containing key1 and value1) by FlushBlockEveryKeyPolicy and hence
+    // trigger buffering of that data block.
+    builder->Add(ik2.Encode(), value2);
+    // Cache charging will increase for the last buffered data block (the one
+    // containing key1 and value1) since the buffer limit is not exceeded after
+    // that buffering and the cache will not be full after this reservation.
+    if (charge_compression_dictionary_building_buffer ==
+        CacheEntryRoleOptions::Decision::kEnabled) {
+      EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+      EXPECT_LT(cache->GetPinnedUsage(),
+                1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+    } else {
+      EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+    }
+
+    ASSERT_OK(builder->Finish());
+    EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+  }
+}
+
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
+       BasicWithBufferLimitExceed) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry;
+
+  // `CacheEntryRoleOptions::charged` is enabled by default for
+  // CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  BlockBasedTableOptions table_options;
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache(NewLRUCache(lo));
+  table_options.block_cache = cache;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+
+  Options options;
+  options.compression = kSnappyCompression;
+  options.compression_opts.max_dict_bytes = kMaxDictBytes;
+  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "test_file_name", FileOptions()));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kSnappyCompression,
+                          options.compression_opts, kUnknownColumnFamily,
+                          "test_cf", -1 /* level */),
+      file_writer.get()));
+
+  std::string key1 = "key1";
+  std::string value1(kSizeDummyEntry, '0');
+  InternalKey ik1(key1, 0 /* sequence number */, kTypeValue);
+  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy,
+  // and therefore won't trigger any data block buffering.
+  builder->Add(ik1.Encode(), value1);
+  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  std::string key2 = "key2";
+  std::string value2(kSizeDummyEntry, '0');
+  InternalKey ik2(key2, 1 /* sequence number */, kTypeValue);
+  // Adding the second key will trigger a flush of the last data block (the one
+  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik2.Encode(), value2);
+  // Cache charging will increase for the last buffered data block (the one
+  // containing key1 and value1) since the buffer limit is not exceeded after
+  // the buffering and the cache will not be full after this reservation.
+  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+  EXPECT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  std::string key3 = "key3";
+  std::string value3 = "val3";
+  InternalKey ik3(key3, 2 /* sequence number */, kTypeValue);
+  // Adding the third key will trigger a flush of the last data block (the one
+  // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik3.Encode(), value3);
+  // Cache charging will decrease since the buffer limit is now exceeded
+  // after the last buffering and EnterUnbuffered() is triggered.
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  ASSERT_OK(builder->Finish());
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+}
+
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  // A small kCacheCapacity is chosen so that increasing the cache charge to
+  // buffer two data blocks (one containing key1/value1, the other containing
+  // key2 and a big value2) will fill up the cache.
+  constexpr std::size_t kCacheCapacity =
+      1 * kSizeDummyEntry + kSizeDummyEntry / 2;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  // A big kMaxDictBufferBytes is chosen so that adding a big key-value pair
+  // (key2, value2) won't exceed the buffer limit.
+  constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024;
+
+  // `CacheEntryRoleOptions::charged` is enabled by default for
+  // CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  BlockBasedTableOptions table_options;
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache(NewLRUCache(lo));
+  table_options.block_cache = cache;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+
+  Options options;
+  options.compression = kSnappyCompression;
+  options.compression_opts.max_dict_bytes = kMaxDictBytes;
+  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "test_file_name", FileOptions()));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kSnappyCompression,
+                          options.compression_opts, kUnknownColumnFamily,
+                          "test_cf", -1 /* level */),
+      file_writer.get()));
+
+  std::string key1 = "key1";
+  std::string value1 = "val1";
+  InternalKey ik1(key1, 0 /* sequence number */, kTypeValue);
+  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy,
+  // and therefore won't trigger any data block buffering.
+  builder->Add(ik1.Encode(), value1);
+  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  std::string key2 = "key2";
+  std::string value2(kSizeDummyEntry, '0');
+  InternalKey ik2(key2, 1 /* sequence number */, kTypeValue);
+  // Adding the second key will trigger a flush of the last data block (the one
+  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik2.Encode(), value2);
+  // Cache charging will increase for the last buffered data block (the one
+  // containing key1 and value1) since the buffer limit is not exceeded after
+  // the buffering and the cache will not be full after this reservation.
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+  EXPECT_LT(cache->GetPinnedUsage(),
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  std::string key3 = "key3";
+  std::string value3 = "value3";
+  InternalKey ik3(key3, 2 /* sequence number */, kTypeValue);
+  // Adding the third key will trigger a flush of the last data block (the one
+  // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik3.Encode(), value3);
+  // Cache charging will decrease since the cache is now full after increasing
+  // the reservation for the last buffered block and EnterUnbuffered() is
+  // triggered.
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  ASSERT_OK(builder->Finish());
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+}
+
+class CacheUsageOptionsOverridesTest : public DBTestBase {
+ public:
+  CacheUsageOptionsOverridesTest()
+      : DBTestBase("cache_usage_options_overrides_test",
+                   /*env_do_fsync=*/false) {}
+};
+
+TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) {
+  // To test that `cache_usage_options.options_overrides` is sanitized such
+  // that `cache_usage_options.options` is used when there is no entry in
+  // `cache_usage_options.options_overrides`
+  Options options;
+  options.create_if_missing = true;
+  BlockBasedTableOptions table_options = BlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Destroy(options);
+  Status s = TryReopen(options);
+  EXPECT_TRUE(s.ok());
+  const auto* sanitized_table_options =
+      options.table_factory->GetOptions<BlockBasedTableOptions>();
+  const auto sanitized_options_overrides =
+      sanitized_table_options->cache_usage_options.options_overrides;
+  EXPECT_EQ(sanitized_options_overrides.size(), kNumCacheEntryRoles);
+  for (auto options_overrides_iter = sanitized_options_overrides.cbegin();
+       options_overrides_iter != sanitized_options_overrides.cend();
+       ++options_overrides_iter) {
+    CacheEntryRoleOptions role_options = options_overrides_iter->second;
+    CacheEntryRoleOptions default_options =
+        sanitized_table_options->cache_usage_options.options;
+    EXPECT_TRUE(role_options == default_options);
+  }
+  Destroy(options);
+
+  // To test option validation on unsupported CacheEntryRole
+  table_options = BlockBasedTableOptions();
+  table_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kDataBlock,
+       {/*.charged = */ CacheEntryRoleOptions::Decision::kDisabled}});
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Destroy(options);
+  s = TryReopen(options);
+  EXPECT_TRUE(s.IsNotSupported());
+  EXPECT_TRUE(
+      s.ToString().find("Enable/Disable CacheEntryRoleOptions::charged") !=
+      std::string::npos);
+  EXPECT_TRUE(
+      s.ToString().find(kCacheEntryRoleToCamelString[static_cast<uint32_t>(
+          CacheEntryRole::kDataBlock)]) != std::string::npos);
+  Destroy(options);
+
+  //
To test option validation on existence of block cache + table_options = BlockBasedTableOptions(); + table_options.no_block_cache = true; + table_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFilterConstruction, + {/*.charged = */ CacheEntryRoleOptions::Decision::kEnabled}}); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Destroy(options); + s = TryReopen(options); + EXPECT_TRUE(s.IsInvalidArgument()); + EXPECT_TRUE(s.ToString().find("Enable CacheEntryRoleOptions::charged") != + std::string::npos); + EXPECT_TRUE( + s.ToString().find(kCacheEntryRoleToCamelString[static_cast<std::size_t>( + CacheEntryRole::kFilterConstruction)]) != std::string::npos); + EXPECT_TRUE(s.ToString().find("block cache is disabled") != + std::string::npos); + Destroy(options); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc new file mode 100644 index 000000000..4b6634e5c --- /dev/null +++ b/src/rocksdb/table/two_level_iterator.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/two_level_iterator.h" + +#include "db/pinned_iterators_manager.h" +#include "memory/arena.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class TwoLevelIndexIterator : public InternalIteratorBase<IndexValue> { + public: + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase<IndexValue>* first_level_iter); + + ~TwoLevelIndexIterator() override { + first_level_iter_.DeleteIter(false /* is_arena_mode */); + second_level_iter_.DeleteIter(false /* is_arena_mode */); + delete state_; + } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; + + bool Valid() const override { return second_level_iter_.Valid(); } + Slice key() const override { + assert(Valid()); + return second_level_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return second_level_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return second_level_iter_.value(); + } + Status status() const override { + if (!first_level_iter_.status().ok()) { + assert(second_level_iter_.iter() == nullptr); + return first_level_iter_.status(); + } else if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().ok()) { + return second_level_iter_.status(); + } else { + return status_; + } + } + void SetPinnedItersMgr( + PinnedIteratorsManager* /*pinned_iters_mgr*/) override {} + bool IsKeyPinned() const override { return false; } + bool IsValuePinned() const override { return false; } + + private: + void SaveError(const Status& s) { + if (status_.ok() && 
!s.ok()) status_ = s; + } + void SkipEmptyDataBlocksForward(); + void SkipEmptyDataBlocksBackward(); + void SetSecondLevelIterator(InternalIteratorBase<IndexValue>* iter); + void InitDataBlock(); + + TwoLevelIteratorState* state_; + IteratorWrapperBase<IndexValue> first_level_iter_; + IteratorWrapperBase<IndexValue> second_level_iter_; // May be nullptr + Status status_; + // If second_level_iter is non-nullptr, then "data_block_handle_" holds the + // "index_value" passed to block_function_ to create the second_level_iter. + BlockHandle data_block_handle_; +}; + +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase<IndexValue>* first_level_iter) + : state_(state), first_level_iter_(first_level_iter) {} + +void TwoLevelIndexIterator::Seek(const Slice& target) { + first_level_iter_.Seek(target); + + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.Seek(target); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { + first_level_iter_.Seek(target); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + if (!Valid()) { + if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + } + SkipEmptyDataBlocksBackward(); + } +} + +void TwoLevelIndexIterator::SeekToFirst() { + first_level_iter_.SeekToFirst(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::SeekToLast() { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIndexIterator::Next() { + assert(Valid()); + second_level_iter_.Next(); + SkipEmptyDataBlocksForward(); +} + +void TwoLevelIndexIterator::Prev() { + assert(Valid()); + second_level_iter_.Prev(); + SkipEmptyDataBlocksBackward(); +} + +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { + // Move to next block + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Next(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToFirst(); + } + } +} + +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { + while (second_level_iter_.iter() == nullptr || + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { + // Move to next block + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Prev(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekToLast(); + } + } +} + +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase<IndexValue>* iter) { + InternalIteratorBase<IndexValue>* old_iter = second_level_iter_.Set(iter); + delete old_iter; +} + +void TwoLevelIndexIterator::InitDataBlock() { + if (!first_level_iter_.Valid()) { + SetSecondLevelIterator(nullptr); + } else { + BlockHandle handle = first_level_iter_.value().handle; + if (second_level_iter_.iter() != nullptr && + !second_level_iter_.status().IsIncomplete() && + handle.offset() == 
data_block_handle_.offset()) { + // second_level_iter is already constructed with this iterator, so + // no need to change anything + } else { + InternalIteratorBase<IndexValue>* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; + SetSecondLevelIterator(iter); + if (iter == nullptr) { + status_ = Status::Corruption("Missing block for partition " + + handle.ToString()); + } + } + } +} + +} // namespace + +InternalIteratorBase<IndexValue>* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase<IndexValue>* first_level_iter) { + return new TwoLevelIndexIterator(state, first_level_iter); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h new file mode 100644 index 000000000..1fed93417 --- /dev/null +++ b/src/rocksdb/table/two_level_iterator.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "table/iterator_wrapper.h" + +namespace ROCKSDB_NAMESPACE { + +struct ReadOptions; +class InternalKeyComparator; + +// TwoLevelIteratorState expects iterators are not created using the arena +struct TwoLevelIteratorState { + TwoLevelIteratorState() {} + + virtual ~TwoLevelIteratorState() {} + virtual InternalIteratorBase<IndexValue>* NewSecondaryIterator( + const BlockHandle& handle) = 0; +}; + +// Return a new two level iterator. A two-level iterator contains an +// index iterator whose values point to a sequence of blocks where +// each block is itself a sequence of key,value pairs. The returned +// two-level iterator yields the concatenation of all key/value pairs +// in the sequence of blocks. Takes ownership of "index_iter" and +// will delete it when no longer needed. +// +// Uses a supplied function to convert an index_iter value into +// an iterator over the contents of the corresponding block. +// Note: this function expects first_level_iter was not created using the arena +extern InternalIteratorBase<IndexValue>* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase<IndexValue>* first_level_iter); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/unique_id.cc b/src/rocksdb/table/unique_id.cc new file mode 100644 index 000000000..fcdd75650 --- /dev/null +++ b/src/rocksdb/table/unique_id.cc @@ -0,0 +1,223 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <cstdint> + +#include "table/unique_id_impl.h" +#include "util/coding_lean.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeSessionId(uint64_t upper, uint64_t lower) { + std::string db_session_id(20U, '\0'); + char *buf = &db_session_id[0]; + // Preserving `lower` is slightly tricky. 
36^12 is slightly more than + // 62 bits, so we use 12 chars plus the bottom two bits of one more. + // (A tiny fraction of 20 digit strings go unused.) + uint64_t a = (upper << 2) | (lower >> 62); + uint64_t b = lower & (UINT64_MAX >> 2); + PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true); + PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true); + assert(buf == &db_session_id.back() + 1); + return db_session_id; +} + +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower) { + const size_t len = db_session_id.size(); + if (len == 0) { + return Status::NotSupported("Missing db_session_id"); + } + // Anything from 13 to 24 chars is reasonable. We don't have to limit to + // exactly 20. + if (len < 13) { + return Status::NotSupported("Too short db_session_id"); + } + if (len > 24) { + return Status::NotSupported("Too long db_session_id"); + } + uint64_t a = 0, b = 0; + const char *buf = &db_session_id.front(); + bool success = ParseBaseChars<36>(&buf, len - 12U, &a); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + success = ParseBaseChars<36>(&buf, 12U, &b); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + assert(buf == &db_session_id.back() + 1); + *upper = a >> 2; + *lower = (b & (UINT64_MAX >> 2)) | (a << 62); + return Status::OK(); +} + +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueIdPtr out, + bool force) { + if (!force) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } + } + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + if (!force) { + return s; + } else { + // A reasonable fallback in case malformed + Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, + &session_lower); + if (session_lower == 0) { + session_lower = session_upper | 1; + } + } + } + } + + // Exactly preserve session lower to ensure that session ids generated + // during the same process lifetime are guaranteed unique. + // DBImpl also guarantees (in recent versions) that this is not zero, + // so that we can guarantee unique ID is never all zeros. (Can't assert + // that here because of testing and old versions.) + // We put this first in anticipation of matching a small-ish set of cache + // key prefixes to cover entries relevant to any DB. + out.ptr[0] = session_lower; + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for very high global uniqueness entropy. + // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_a, db_b; + Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b); + + // Xor in file number for guaranteed uniqueness by file number for a given + // session and DB id. (Xor slightly better than + here. 
See + // https://github.com/pdillinger/unique_id ) + out.ptr[1] = db_a ^ file_number; + + // Extra (optional) global uniqueness + if (out.extended) { + out.ptr[2] = db_b; + } + + return Status::OK(); +} + +namespace { +// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all +// zeros in first 128 bits to map to itself, so that excluding zero in +// internal IDs (session_lower != 0 above) does the same for external IDs. +// These values are meaningless except for making that work. +constexpr uint64_t kHiOffsetForZero = 17391078804906429400U; +constexpr uint64_t kLoOffsetForZero = 6417269962128484497U; +} // namespace + +void InternalUniqueIdToExternal(UniqueIdPtr in_out) { + uint64_t hi, lo; + BijectiveHash2x64(in_out.ptr[1] + kHiOffsetForZero, + in_out.ptr[0] + kLoOffsetForZero, &hi, &lo); + in_out.ptr[0] = lo; + in_out.ptr[1] = hi; + if (in_out.extended) { + in_out.ptr[2] += lo + hi; + } +} + +void ExternalUniqueIdToInternal(UniqueIdPtr in_out) { + uint64_t lo = in_out.ptr[0]; + uint64_t hi = in_out.ptr[1]; + if (in_out.extended) { + in_out.ptr[2] -= lo + hi; + } + BijectiveUnhash2x64(hi, lo, &hi, &lo); + in_out.ptr[0] = lo - kLoOffsetForZero; + in_out.ptr[1] = hi - kHiOffsetForZero; +} + +std::string EncodeUniqueIdBytes(UniqueIdPtr in) { + std::string ret(in.extended ? 24U : 16U, '\0'); + EncodeFixed64(&ret[0], in.ptr[0]); + EncodeFixed64(&ret[8], in.ptr[1]); + if (in.extended) { + EncodeFixed64(&ret[16], in.ptr[2]); + } + return ret; +} + +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) { + if (unique_id.size() != (out.extended ? 24 : 16)) { + return Status::NotSupported("Not a valid unique_id"); + } + const char *buf = &unique_id.front(); + out.ptr[0] = DecodeFixed64(&buf[0]); + out.ptr[1] = DecodeFixed64(&buf[8]); + if (out.extended) { + out.ptr[2] = DecodeFixed64(&buf[16]); + } + return Status::OK(); +} + +template <typename ID> +Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props, + std::string *out_id) { + ID tmp{}; + Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, + props.orig_file_number, &tmp); + if (s.ok()) { + InternalUniqueIdToExternal(&tmp); + *out_id = EncodeUniqueIdBytes(&tmp); + } else { + out_id->clear(); + } + return s; +} + +Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + return GetUniqueIdFromTablePropertiesHelper<UniqueId64x3>(props, out_id); +} + +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + return GetUniqueIdFromTablePropertiesHelper<UniqueId64x2>(props, out_id); +} + +std::string UniqueIdToHumanString(const std::string &id) { + // Not so efficient, but that's OK + std::string str = Slice(id).ToString(/*hex*/ true); + for (size_t i = 16; i < str.size(); i += 17) { + str.insert(i, "-"); + } + return str; +} + +std::string InternalUniqueIdToHumanString(UniqueIdPtr in) { + std::string str = "{"; + str += std::to_string(in.ptr[0]); + str += ","; + str += std::to_string(in.ptr[1]); + if (in.extended) { + str += ","; + str += std::to_string(in.ptr[2]); + } + str += "}"; + return str; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/unique_id_impl.h b/src/rocksdb/table/unique_id_impl.h new file mode 100644 index 000000000..6e3dc62c7 --- /dev/null +++ b/src/rocksdb/table/unique_id_impl.h @@ -0,0 +1,93 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <array> + +#include "rocksdb/unique_id.h" + +namespace ROCKSDB_NAMESPACE { + +// Standard size unique ID, good enough for almost all practical purposes +using UniqueId64x2 = std::array<uint64_t, 2>; + +// Value never used as an actual unique ID so can be used for "null" +constexpr UniqueId64x2 kNullUniqueId64x2 = {}; + +// Extended size unique ID, for extra certainty of uniqueness among SST files +// spanning many hosts over a long time (rarely if ever needed) +using UniqueId64x3 = std::array<uint64_t, 3>; + +// Value never used as an actual unique ID so can be used for "null" +constexpr UniqueId64x3 kNullUniqueId64x3 = {}; + +// Dynamic pointer wrapper for one of the two above +struct UniqueIdPtr { + uint64_t *ptr = nullptr; + bool extended = false; + + /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) { + ptr = (*id).data(); + extended = false; + } + /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) { + ptr = (*id).data(); + extended = true; + } +}; + +// Helper for GetUniqueIdFromTableProperties. This function can also be used +// for temporary ids for files without sufficient information in table +// properties. The internal unique id is more structured than the public +// unique id, so can be manipulated in more ways but very carefully. +// These must be long term stable to ensure GetUniqueIdFromTableProperties +// is long term stable. +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueIdPtr out, + bool force = false); + +// Helper for GetUniqueIdFromTableProperties. External unique ids go through +// this extra hashing layer so that prefixes of the unique id have predictable +// "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on +// the full 192 bits. +// This transformation must be long term stable to ensure +// GetUniqueIdFromTableProperties is long term stable. +void InternalUniqueIdToExternal(UniqueIdPtr in_out); + +// Reverse of InternalUniqueIdToExternal mostly for testing purposes +// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits). +void ExternalUniqueIdToInternal(UniqueIdPtr in_out); + +// Convert numerical format to byte format for public API +std::string EncodeUniqueIdBytes(UniqueIdPtr in); + +// Reverse of EncodeUniqueIdBytes. +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out); + +// For presenting internal IDs for debugging purposes. Visually distinct from +// UniqueIdToHumanString for external IDs. +std::string InternalUniqueIdToHumanString(UniqueIdPtr in); + +// Reformat a random value down to our "DB session id" format, +// which is intended to be compact and friendly for use in file names. +// `lower` is fully preserved and data is lost from `upper`. +// +// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of +// entropy, which is enough to expect no collisions across a billion servers +// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id: +// * Save ~ dozen bytes per SST file +// * Shorter shared backup file names (some platforms have low limits) +// * Visually distinct from DB id format (usually RFC-4122) +std::string EncodeSessionId(uint64_t upper, uint64_t lower); + +// Reverse of EncodeSessionId. 
Returns NotSupported on error rather than +// Corruption because non-standard session IDs should be allowed with degraded +// functionality. +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower); + +} // namespace ROCKSDB_NAMESPACE
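The helpers declared in unique_id_impl.h above are intended to compose as follows. This is a minimal usage sketch, not part of the diff: it assumes compilation inside the RocksDB source tree (so that "table/unique_id_impl.h" and ROCKSDB_NAMESPACE resolve), and the db_id string and file number it passes are purely hypothetical placeholders.

// Minimal sketch: round-trip a DB session id, then derive a standard-size
// SST unique id and map it from internal to external form.
#include <cassert>
#include <iostream>
#include <string>

#include "table/unique_id_impl.h"  // declares the helpers shown in this diff

int main() {
  using namespace ROCKSDB_NAMESPACE;

  // EncodeSessionId packs (upper, lower) into 20 base-36 chars; `lower` is
  // preserved exactly, while only part of `upper` survives the encoding.
  uint64_t upper = 0x123456789ULL;
  uint64_t lower = 0xDEADBEEFCAFEF00DULL;
  std::string session_id = EncodeSessionId(upper, lower);

  uint64_t upper2 = 0, lower2 = 0;
  Status s = DecodeSessionId(session_id, &upper2, &lower2);
  assert(s.ok());
  assert(lower2 == lower);  // `lower` round-trips exactly

  // Derive an internal unique id for a hypothetical db_id / file number,
  // then convert it to the external (public) form.
  UniqueId64x2 id = kNullUniqueId64x2;
  s = GetSstInternalUniqueId("hypothetical-db-id", session_id,
                             /*file_number=*/42, &id);
  if (s.ok()) {
    std::cout << "internal: " << InternalUniqueIdToHumanString(&id) << "\n";
    InternalUniqueIdToExternal(&id);
    std::cout << "external id is " << EncodeUniqueIdBytes(&id).size()
              << " bytes\n";  // 16 bytes for the standard-size id
  }
  return 0;
}

The "hypothetical-db-id" string and file number 42 are illustrative only; real callers obtain these values from the DB and from table properties, as GetUniqueIdFromTablePropertiesHelper in unique_id.cc does.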