author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/table/block_based
parent | Initial commit. (diff)
download | ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds. (upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/table/block_based')
43 files changed, 16655 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block.cc b/src/rocksdb/table/block_based/block.cc new file mode 100644 index 000000000..a04dd8ac2 --- /dev/null +++ b/src/rocksdb/table/block_based/block.cc @@ -0,0 +1,1004 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block_based/block.h" +#include <algorithm> +#include <string> +#include <unordered_map> +#include <vector> + +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" +#include "table/format.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + assert(limit - p >= 3); + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +// Helper routine: similar to DecodeEntry but does not have assertions. +// Instead, returns nullptr so that caller can detect and report failure. +struct CheckAndDecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
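The fast path above relies on the entry header layout: three varint32 fields (shared key length, non-shared key length, value length), each of which occupies exactly one byte when its value is below 128. As an illustration, here is a minimal standalone sketch of that header encoding, using a local varint helper rather than RocksDB's util/coding.h (all names below are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Minimal varint32 append, mirroring the LevelDB/RocksDB wire format:
    // 7 payload bits per byte, high bit set on all but the last byte.
    static void AppendVarint32(std::string* out, uint32_t v) {
      while (v >= 128) {
        out->push_back(static_cast<char>(v | 0x80));
        v >>= 7;
      }
      out->push_back(static_cast<char>(v));
    }

    // Encode the three-field entry header described above.
    static std::string EncodeEntryHeader(uint32_t shared, uint32_t non_shared,
                                         uint32_t value_length) {
      std::string h;
      AppendVarint32(&h, shared);
      AppendVarint32(&h, non_shared);
      AppendVarint32(&h, value_length);
      return h;
    }

    int main() {
      // All three fields < 128: the header is exactly 3 bytes (fast path).
      assert(EncodeEntryHeader(3, 5, 7).size() == 3);
      // A large value length needs a multi-byte varint (slow path).
      assert(EncodeEntryHeader(3, 5, 300).size() == 4);
      return 0;
    }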
+ if (limit - p < 3) { + return nullptr; + } + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; + } +}; + +void DataBlockIter::Next() { + assert(Valid()); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::NextOrReport() { + assert(Valid()); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::Next() { + assert(Valid()); + ParseNextIndexKey(); +} + +void IndexBlockIter::Prev() { + assert(Valid()); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + // Loop until end of current entry hits the start of original entry + while (ParseNextIndexKey() && NextEntryOffset() < original) { + } +} + +// Similar to IndexBlockIter::Prev but also caches the prev entries +void DataBlockIter::Prev() { + assert(Valid()); + + assert(prev_entries_idx_ == -1 || + static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* key_ptr = nullptr; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + key_pinned_ = true; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + key_pinned_ 
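Both Prev() implementations above share the same backward step: walk the restart array down to the last restart point strictly before the current entry, then re-parse forward from there. A toy model of that step, with a plain vector standing in for the restart array (a sketch, not RocksDB code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // restart_offsets holds the byte offset of each restart point; `original`
    // is the offset of the entry we are stepping back from. Returns the index
    // of the last restart point strictly before `original`, or UINT32_MAX if
    // none exists (the iterator becomes invalid in that case).
    static uint32_t RestartBefore(const std::vector<uint32_t>& restart_offsets,
                                  uint32_t original) {
      uint32_t idx = static_cast<uint32_t>(restart_offsets.size()) - 1;
      while (restart_offsets[idx] >= original) {
        if (idx == 0) return UINT32_MAX;  // no earlier entry
        --idx;
      }
      return idx;
    }

    int main() {
      std::vector<uint32_t> restarts = {0, 64, 128};
      assert(RestartBefore(restarts, 70) == 1);          // inside 2nd interval
      assert(RestartBefore(restarts, 64) == 0);          // exactly on a restart
      assert(RestartBefore(restarts, 0) == UINT32_MAX);  // before first entry
      return 0;
    }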
= false; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + key_.SetKey(current_key, false /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + + do { + if (!ParseNextDataKey<DecodeEntry>()) { + break; + } + Slice current_key = key(); + + if (key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1; +} + +void DataBlockIter::Seek(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= target + while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { + } +} + +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex, this function behaves identically as Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key_ with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+  Slice target_user_key = ExtractUserKey(target);
+  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+  uint8_t entry =
+      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
+
+  if (entry == kCollision) {
+    // HashSeek not effective, falling back
+    Seek(target);
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    // Even if we cannot find the user_key in this block, the result may
+    // exist in the next block. Consider this example:
+    //
+    // Block N:     [aab@100, ... , app@120]
+    // boundary key: axy@50 (we make minimal assumptions about a boundary key)
+    // Block N+1:   [axy@10, ... ]
+    //
+    // If seek_key = axy@60, the search will start from Block N.
+    // Even if the user_key is not found in the hash map, the caller still
+    // has to continue searching the next block.
+    //
+    // In this case, we pretend the key is in the last restart interval.
+    // The while-loop below will search the last restart interval for the
+    // key. It will stop at the first key that is larger than the seek_key,
+    // or at the end of the block if no key is larger.
+    entry = static_cast<uint8_t>(num_restarts_ - 1);
+  }
+
+  uint32_t restart_index = entry;
+
+  // check if the key is in the restart_interval
+  assert(restart_index < num_restarts_);
+  SeekToRestartPoint(restart_index);
+
+  const char* limit = nullptr;
+  if (restart_index_ + 1 < num_restarts_) {
+    limit = data_ + GetRestartPoint(restart_index_ + 1);
+  } else {
+    limit = data_ + restarts_;
+  }
+
+  while (true) {
+    // Here we only linearly seek the target key inside the restart interval.
+    // If a key does not exist inside a restart interval, we avoid
+    // further searching the block content across restart interval boundaries.
+    //
+    // TODO(fwu): check the left and right boundary of the restart interval
+    // to avoid linearly seeking a target key that is out of range.
+    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
+      // we stop at the first potential matching user key.
+      break;
+    }
+  }
+
+  if (current_ == restarts_) {
+    // The search reached the end of the block. There are three possibilities:
+    // 1) there is only one user_key match in the block (otherwise collision).
+    //    the matching user_key resides in the last restart interval, and it
+    //    is the last key of the restart interval and of the block as well.
+    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
+    //
+    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+    //    AND all existing user_keys in the restart interval are smaller than
+    //    seek_user_key.
+    //
+    // 3) The seek_key is a false positive and happens to be hashed to the
+    //    last restart interval, AND all existing user_keys in the restart
+    //    interval are smaller than seek_user_key.
+    //
+    // The result may exist in the next block in each case, so we return true.
+    return true;
+  }
+
+  if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
+    // the key is not in this block and cannot be in the next block either.
+    return false;
+  }
+
+  // Here we are conservative and only support a limited set of cases
+  ValueType value_type = ExtractValueType(key_.GetKey());
+  if (value_type != ValueType::kTypeValue &&
+      value_type != ValueType::kTypeDeletion &&
+      value_type != ValueType::kTypeSingleDeletion &&
+      value_type != ValueType::kTypeBlobIndex) {
+    Seek(target);
+    return true;
+  }
+
+  // Result found, and the iter is correctly set.
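The "user_key @ type | seqno" notation in the comments above refers to RocksDB's internal-key trailer. A sketch of why a higher seqno sorts first among equal user keys: the trailer packs (seqno << 8) | type into 8 bytes and the internal comparator orders that trailer descending. The packing mirrors db/dbformat.h; the concrete enum value used below is an assumption for illustration:

    #include <cassert>
    #include <cstdint>

    // Pack a 56-bit sequence number and a one-byte value type into the
    // 8-byte internal-key trailer.
    static uint64_t PackSeqnoAndType(uint64_t seqno, uint8_t type) {
      assert(seqno < (1ull << 56));  // seqnos are 56-bit
      return (seqno << 8) | type;
    }

    int main() {
      const uint8_t kTypeValue = 0x1;  // assumed to match dbformat.h
      // axy@60 vs axy@10: the higher seqno yields the larger trailer and
      // therefore sorts earlier under the descending internal ordering.
      assert(PackSeqnoAndType(60, kTypeValue) >
             PackSeqnoAndType(10, kTypeValue));
      return 0;
    }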
+ return true; +} + +void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + uint32_t index = 0; + bool ok = false; + if (prefix_index_) { + bool prefix_may_exist = true; + ok = PrefixSeek(target, &index, &prefix_may_exist); + if (!prefix_may_exist) { + // This is to let the caller to distinguish between non-existing prefix, + // and when key is larger than the last key, which both set Valid() to + // false. + current_ = restarts_; + status_ = Status::NotFound(); + } + } else if (value_delta_encoded_) { + ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } else { + ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= target + while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) { + } +} + +void DataBlockIter::SeekForPrev(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= seek_key + while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { + } + if (!Valid()) { + SeekToLast(); + } else { + while (Valid() && Compare(key_, seek_key) > 0) { + Prev(); + } + } +} + +void DataBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::SeekToFirstOrReport() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(0); + ParseNextIndexKey(); +} + +void DataBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void IndexBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +template <class TValue> +void BlockIter<TValue>::CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.Clear(); + value_.clear(); +} + +template <typename DecodeEntryFunc> +bool DataBlockIter::ParseNextDataKey(const char* limit) { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + if (!limit) { + limit = data_ + restarts_; // Restarts come right after data + } + + if (p >= limit) { + // No more entries to return. Mark as invalid. 
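Seek(), in both iterators above, is a two-phase search: binary-search the restart array for the last restart key below the target, then scan forward linearly for the first key >= target. A self-contained model of the pattern on a sorted array, where every interval-th element plays the role of a restart point (the upward-biased midpoint matches the BinarySeek shown further down):

    #include <cassert>
    #include <vector>

    // Returns the index of the first key >= target; keys.size() means
    // "past the end" (the iterator would become invalid).
    static size_t TwoPhaseSeek(const std::vector<int>& keys, size_t interval,
                               int target) {
      // Phase 1: binary search over restart points (indices 0, interval, ...).
      size_t left = 0, right = (keys.size() - 1) / interval;
      while (left < right) {
        size_t mid = (left + right + 1) / 2;  // bias up so left = mid converges
        if (keys[mid * interval] < target) {
          left = mid;       // restarts before mid are uninteresting
        } else {
          right = mid - 1;  // restarts at/after mid are uninteresting
        }
      }
      // Phase 2: linear scan starting at the chosen restart point.
      size_t i = left * interval;
      while (i < keys.size() && keys[i] < target) ++i;
      return i;
    }

    int main() {
      std::vector<int> keys = {10, 20, 30, 40, 50, 60, 70, 80};
      assert(TwoPhaseSeek(keys, 2, 45) == 4);  // first key >= 45 is 50
      assert(TwoPhaseSeek(keys, 2, 10) == 0);
      assert(TwoPhaseSeek(keys, 2, 99) == keys.size());
      return 0;
    }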
+ current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } else { + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. + key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. + assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(key_.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + if (key_pinned_) { + // TODO(tec): Investigate updating the seqno in the loaded block + // directly instead of doing a copy and update. + + // We cannot use the key address in the block directly because + // we have a global_seqno_ that will overwrite the encoded one. + key_.OwnKey(); + key_pinned_ = false; + } + + key_.UpdateInternalKey(global_seqno_, value_type); + } + + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + return true; + } +} + +bool IndexBlockIter::ParseNextIndexKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. 
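The TrimAppend() call above reconstructs a prefix-compressed key: keep the first `shared` bytes of the previous key, then append the `non_shared` suffix stored in the block. A one-function sketch of the same idea:

    #include <cassert>
    #include <string>

    // Rebuild the current key from the previous key and the suffix bytes
    // stored in the block entry.
    static std::string ReconstructKey(const std::string& prev_key,
                                      size_t shared,
                                      const char* non_shared_data,
                                      size_t non_shared_len) {
      std::string key = prev_key.substr(0, shared);
      key.append(non_shared_data, non_shared_len);
      return key;
    }

    int main() {
      // "apple" followed by an entry with shared=2 and suffix "ricot".
      assert(ReconstructKey("apple", 2, "ricot", 5) == "apricot");
      return 0;
    }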
+ key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + DecodeCurrentValue(shared); + } + return true; +} + +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); + } +} + +// Binary search in restart array to find the first restart point that +// is either the last restart point with a key less than target, +// which means the key of next restart point is larger than target, or +// the first restart point with a key = target +template <class TValue> +template <typename DecodeKeyFunc> +bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + int cmp = comp->Compare(mid_key, target); + if (cmp < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. 
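The delta decoding documented above can be summarized as: within a restart interval only the first handle carries an explicit offset, and each later offset is derived from the sizes that precede it. A simplified sketch under that reading (the real DecodeFrom also accounts for per-block trailer bytes, which this deliberately omits):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Handle {
      uint64_t offset;
      uint64_t size;
    };

    // Expand one restart interval: first_offset is stored explicitly; each
    // subsequent handle starts where the previous one ends.
    static std::vector<Handle> DecodeInterval(
        uint64_t first_offset, const std::vector<uint64_t>& sizes) {
      std::vector<Handle> out;
      uint64_t offset = first_offset;
      for (uint64_t sz : sizes) {
        out.push_back({offset, sz});
        offset += sz;  // next handle follows this one
      }
      return out;
    }

    int main() {
      auto handles = DecodeInterval(100, {10, 20, 30});
      assert(handles[1].offset == 110 && handles[2].offset == 130);
      return 0;
    }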
+ left = mid; + } else if (cmp > 0) { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } else { + left = right = mid; + } + } + + *index = left; + return true; +} + +// Compare target key and the block key of the block of `block_index`. +// Return -1 if error. +int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { + uint32_t region_offset = GetRestartPoint(block_index); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return 1; // Return target is smaller + } + Slice block_key(key_ptr, non_shared); + return Compare(block_key, target); +} + +// Binary search in block_ids to find the first block +// with a key >= target +bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, + uint32_t* block_ids, uint32_t left, + uint32_t right, uint32_t* index, + bool* prefix_may_exist) { + assert(left <= right); + assert(index); + assert(prefix_may_exist); + *prefix_may_exist = true; + uint32_t left_bound = left; + + while (left <= right) { + uint32_t mid = (right + left) / 2; + + int cmp = CompareBlockKey(block_ids[mid], target); + if (!status_.ok()) { + return false; + } + if (cmp < 0) { + // Key at "target" is larger than "mid". Therefore all + // blocks before or at "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "target" is <= "mid". Therefore all blocks + // after "mid" are uninteresting. + // If there is only one block left, we found it. + if (left == right) break; + right = mid; + } + } + + if (left == right) { + // In one of the two following cases: + // (1) left is the first one of block_ids + // (2) there is a gap of blocks between block of `left` and `left-1`. + // we can further distinguish the case of key in the block or key not + // existing, by comparing the target key and the key of the previous + // block to the left of the block found. + if (block_ids[left] > 0 && + (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && + CompareBlockKey(block_ids[left] - 1, target) > 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } + + *index = block_ids[left]; + return true; + } else { + assert(left > right); + + // If the next block key is larger than seek key, it is possible that + // no key shares the prefix with `target`, or all keys with the same + // prefix as `target` are smaller than prefix. In the latter case, + // we are mandated to set the position the same as the total order. + // In the latter case, either: + // (1) `target` falls into the range of the next block. In this case, + // we can place the iterator to the next block, or + // (2) `target` is larger than all block keys. In this case we can + // keep the iterator invalidate without setting `prefix_may_exist` + // to false. + // We might sometimes end up with setting the total order position + // while there is no key sharing the prefix as `target`, but it + // still follows the contract. 
+ uint32_t right_index = block_ids[right]; + assert(right_index + 1 <= num_restarts_); + if (right_index + 1 < num_restarts_) { + if (CompareBlockKey(right_index + 1, target) >= 0) { + *index = right_index + 1; + return true; + } else { + // We have to set the flag here because we are not positioning + // the iterator to the total order position. + *prefix_may_exist = false; + } + } + + // Mark iterator invalid + current_ = restarts_; + return false; + } +} + +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, + bool* prefix_may_exist) { + assert(index); + assert(prefix_may_exist); + assert(prefix_index_); + *prefix_may_exist = true; + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + uint32_t* block_ids = nullptr; + uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); + + if (num_blocks == 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } else { + assert(block_ids); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, + prefix_may_exist); + } +} + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2 * sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. 
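NumRestarts() and IndexType() above both decode the same 4-byte footer. A sketch of the packing the comments imply, assuming the most significant bit carries the index type and the low 31 bits carry num_restarts (a stand-in for UnPackIndexTypeAndNumRestarts, not copied from it):

    #include <cassert>
    #include <cstdint>

    // MSB set => kDataBlockBinaryAndHash; low 31 bits => num_restarts.
    static uint32_t PackFooter(bool binary_and_hash, uint32_t num_restarts) {
      assert(num_restarts < 0x80000000u);
      return (binary_and_hash ? 0x80000000u : 0u) | num_restarts;
    }

    static void UnpackFooter(uint32_t footer, bool* binary_and_hash,
                             uint32_t* num_restarts) {
      *binary_and_hash = (footer & 0x80000000u) != 0;
      *num_restarts = footer & 0x7fffffffu;
    }

    int main() {
      bool hash = false;
      uint32_t n = 0;
      UnpackFooter(PackFooter(true, 42), &hash, &n);
      assert(hash && n == 42);
      return 0;
    }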
+ // TEST_SYNC_POINT("Block::~Block"); +} + +Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()), + restart_offset_(0), + num_restarts_(0), + global_seqno_(_global_seqno) { + TEST_SYNC_POINT("Block::Block:0"); + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + // Should only decode restart points for uncompressed blocks + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast<uint32_t>(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast<uint16_t>(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker + } + } + if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + read_amp_bitmap_.reset(new BlockReadAmpBitmap( + restart_offset_, read_amp_bytes_per_bit, statistics)); + } +} + +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { + DataBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new DataBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + ret_iter->Initialize( + cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + read_amp_bitmap_.get(), block_contents_pinned, + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + if (read_amp_bitmap_) { + if (read_amp_bitmap_->GetStatistics() != stats) { + // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ + read_amp_bitmap_->SetStatistics(stats); + } + } + } + + return ret_iter; +} + +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { + IndexBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new IndexBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + BlockPrefixIndex* prefix_index_ptr = + total_order_seek ? 
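The kDataBlockBinarySearch branch of the constructor above computes the restart-array offset and detects corruption via unsigned wraparound. A compact model of that layout and check, assuming the block is laid out as [ entries | restart[0..n-1] (4 bytes each) | footer (4 bytes) ]:

    #include <cassert>
    #include <cstdint>

    // Returns false when num_restarts is too large for the block, i.e. the
    // subtraction wrapped around -- the same condition the constructor's
    // "restart_offset_ > size_ - sizeof(uint32_t)" test catches.
    static bool ComputeRestartOffset(size_t block_size, uint32_t num_restarts,
                                     uint32_t* restart_offset) {
      uint32_t off = static_cast<uint32_t>(block_size) -
                     (1 + num_restarts) * static_cast<uint32_t>(sizeof(uint32_t));
      if (off > block_size - sizeof(uint32_t)) {
        return false;  // wrapped around: block too small for num_restarts
      }
      *restart_offset = off;
      return true;
    }

    int main() {
      uint32_t off = 0;
      assert(ComputeRestartOffset(100, 3, &off) && off == 84);
      assert(!ComputeRestartOffset(8, 1000, &off));  // corrupt footer
      return 0;
    }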
nullptr : prefix_index; + ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); + } + + return ret_iter; +} + +size_t Block::ApproximateMemoryUsage() const { + size_t usage = usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + if (read_amp_bitmap_) { + usage += read_amp_bitmap_->ApproximateMemoryUsage(); + } + return usage; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h new file mode 100644 index 000000000..e82a1b2a6 --- /dev/null +++ b/src/rocksdb/table/block_based/block.h @@ -0,0 +1,631 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "port/malloc.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +struct BlockContents; +class Comparator; +template <class TValue> +class BlockIter; +class DataBlockIter; +class IndexBlockIter; +class BlockPrefixIndex; + +// BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data +// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of +// bytes in the Block we update the bitmap and increment +// READ_AMP_ESTIMATE_USEFUL_BYTES. 
+class BlockReadAmpBitmap { + public: + explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, + Statistics* statistics) + : bitmap_(nullptr), + bytes_per_bit_pow_(0), + statistics_(statistics), + rnd_(Random::GetTLSInstance()->Uniform( + static_cast<int>(bytes_per_bit))) { + TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); + assert(block_size > 0 && bytes_per_bit > 0); + + // convert bytes_per_bit to be a power of 2 + while (bytes_per_bit >>= 1) { + bytes_per_bit_pow_++; + } + + // num_bits_needed = ceil(block_size / bytes_per_bit) + size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; + assert(num_bits_needed > 0); + + // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) + size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; + + // Create bitmap and set all the bits to 0 + bitmap_ = new std::atomic<uint32_t>[bitmap_size](); + + RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); + } + + ~BlockReadAmpBitmap() { delete[] bitmap_; } + + void Mark(uint32_t start_offset, uint32_t end_offset) { + assert(end_offset >= start_offset); + // Index of first bit in mask + uint32_t start_bit = + (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> + bytes_per_bit_pow_; + // Index of last bit in mask + 1 + uint32_t exclusive_end_bit = + (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; + if (start_bit >= exclusive_end_bit) { + return; + } + assert(exclusive_end_bit > 0); + + if (GetAndSet(start_bit) == 0) { + uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) + << bytes_per_bit_pow_; + RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, + new_useful_bytes); + } + } + + Statistics* GetStatistics() { + return statistics_.load(std::memory_order_relaxed); + } + + void SetStatistics(Statistics* stats) { statistics_.store(stats); } + + uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } + + size_t ApproximateMemoryUsage() const { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size((void*)this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return sizeof(*this); + } + + private: + // Get the current value of bit at `bit_idx` and set it to 1 + inline bool GetAndSet(uint32_t bit_idx) { + const uint32_t byte_idx = bit_idx / kBitsPerEntry; + const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); + + return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & + bit_mask; + } + + const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes + const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits + + // Bitmap used to record the bytes that we read, use atomic to protect + // against multiple threads updating the same bit + std::atomic<uint32_t>* bitmap_; + // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize + // muliplication and division + uint8_t bytes_per_bit_pow_; + // Pointer to DB Statistics object, Since this bitmap may outlive the DB + // this pointer maybe invalid, but the DB will update it to a valid pointer + // by using SetStatistics() before calling Mark() + std::atomic<Statistics*> statistics_; + uint32_t rnd_; +}; + +// This Block class is not for any old block: it is designed to hold only +// uncompressed blocks containing sorted key-value pairs. It is thus +// suitable for storing uncompressed data blocks, index blocks (including +// partitions), range deletion blocks, properties blocks, metaindex blocks, +// as well as the top level of the partitioned filter structure (which is +// actually an index of the filter partitions). 
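The constructor arithmetic in BlockReadAmpBitmap above reduces to three steps: round bytes_per_bit down to a power of two (kept as an exponent), size the bitmap at one bit per chunk, and pack 32 bits into each atomic entry. A sketch of just the sizing math:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // How many atomic<uint32_t> entries the bitmap needs for a block.
    static size_t BitmapEntries(size_t block_size, size_t bytes_per_bit) {
      uint8_t pow = 0;
      while (bytes_per_bit >>= 1) pow++;                // floor(log2)
      size_t num_bits = ((block_size - 1) >> pow) + 1;  // ceil(size / 2^pow)
      return (num_bits - 1) / 32 + 1;                   // ceil(num_bits / 32)
    }

    int main() {
      // 4 KiB block, 32 bytes per bit -> 128 bits -> 4 uint32 entries.
      assert(BitmapEntries(4096, 32) == 4);
      // A 1-byte block still needs one bit and one entry.
      assert(BitmapEntries(1, 32) == 1);
      return 0;
    }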
It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries (since the latter do not contain sorted key-value pairs). +// Use BlockContents directly for those. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit = 0, + Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + // The additional memory space taken by the block data. + size_t usable_size() const { return contents_.usable_size(); } + uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + + BlockBasedTableOptions::DataBlockIndexType IndexType() const; + + // If comparator is InternalKeyComparator, user_comparator is its user + // comparator; they are equal otherwise. + // + // If iter is null, return new Iterator + // If iter is not null, update this one and return it as Iterator* + // + // Updates read_amp_bitmap_ if it is not nullptr. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); + + // Report an approximation of how much memory has been used. 
+ size_t ApproximateMemoryUsage() const; + + SequenceNumber global_seqno() const { return global_seqno_; } + + private: + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() + uint32_t restart_offset_; // Offset in data_ of restart array + uint32_t num_restarts_; + std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_; + // All keys in the block will have seqno = global_seqno_, regardless of + // the encoded value (kDisableGlobalSequenceNumber means disabled) + const SequenceNumber global_seqno_; + + DataBlockHashIndex data_block_hash_index_; +}; + +template <class TValue> +class BlockIter : public InternalIteratorBase<TValue> { + public: + void InitializeBase(const Comparator* comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, bool block_contents_pinned) { + assert(data_ == nullptr); // Ensure it is called only once + assert(num_restarts > 0); // Ensure the param is valid + + comparator_ = comparator; + data_ = data; + restarts_ = restarts; + num_restarts_ = num_restarts; + current_ = restarts_; + restart_index_ = num_restarts_; + global_seqno_ = global_seqno; + block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; + } + + // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do + // nothing. Calls cleanup functions. + void InvalidateBase(Status s) { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + + data_ = nullptr; + current_ = restarts_; + status_ = s; + + // Call cleanup callbacks. + Cleanable::Reset(); + } + + bool Valid() const override { return current_ < restarts_; } + Status status() const override { return status_; } + Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + +#ifndef NDEBUG + ~BlockIter() override { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } + + bool IsValuePinned() const override { return block_contents_pinned_; } + + size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } + + uint32_t ValueOffset() const { + return static_cast<uint32_t>(value_.data() - data_); + } + + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + + protected: + // Note: The type could be changed to InternalKeyComparator but we see a weird + // performance drop by that. + const Comparator* comparator_; + const char* data_; // underlying block contents + uint32_t num_restarts_; // Number of uint32_t entries in restart array + + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + IterKey key_; + Slice value_; + Status status_; + bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. 
PinnableSlice, the pointer to the bytes will still be valid. + bool block_contents_pinned_; + SequenceNumber global_seqno_; + + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + + public: + // Return the offset in data_ just past the end of the current entry. + inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast<uint32_t>((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + void CorruptionError(); + + template <typename DecodeKeyFunc> + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); +}; + +class DataBlockIter final : public BlockIter<Slice> { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, + const char* data, uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) + : DataBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); + } + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { + InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + block_contents_pinned); + user_comparator_ = user_comparator; + key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + void Seek(const Slice& target) override; + + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + + void SeekForPrev(const Slice& target) override; + + void Prev() override; + + void Next() final override; + + // Try to advance to the next entry in the block. If there is data corruption + // or error, report it to the caller instead of aborting the process. May + // incur higher CPU overhead because we need to perform check on every entry. + void NextOrReport(); + + void SeekToFirst() override; + + // Try to seek to the first entry in the block. 
If there is data corruption + // or error, report it to caller instead of aborting the process. May incur + // higher CPU overhead because we need to perform check on every entry. + void SeekToFirstOrReport(); + + void SeekToLast() override; + + void Invalidate(Status s) { + InvalidateBase(s); + // Clear prev entries cache. + prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + private: + // read-amp bitmap + BlockReadAmpBitmap* read_amp_bitmap_; + // last `current_` value we report to read-amp bitmp + mutable uint32_t last_bitmap_offset_; + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector<CachedPrevEntry> prev_entries_; + int32_t prev_entries_idx_ = -1; + + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + template <typename DecodeEntryFunc> + inline bool ParseNextDataKey(const char* limit = nullptr); + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetInternalKey(), b); + } + + bool SeekForGetImpl(const Slice& target); +}; + +class IndexBlockIter final : public BlockIter<IndexValue> { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { + InitializeBase(key_includes_seq ? comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); + key_includes_seq_ = key_includes_seq; + key_.SetIsUserKey(!key_includes_seq_); + prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } + } + + Slice user_key() const override { + if (key_includes_seq_) { + return ExtractUserKey(key()); + } + return key(); + } + + IndexValue value() const override { + assert(Valid()); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + return decoded_value_; + } else { + IndexValue entry; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); + assert(decode_s.ok()); + return entry; + } + } + + // IndexBlockIter follows a different contract for prefix iterator + // from data iterators. 
+ // If prefix of the seek key `target` exists in the file, it must + // return the same result as total order seek. + // If the prefix of `target` doesn't exist in the file, it can either + // return the result of total order seek, or set both of Valid() = false + // and status() = NotFound(). + void Seek(const Slice& target) override; + + void SeekForPrev(const Slice&) override { + assert(false); + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::InvalidArgument( + "RocksDB internal error: should never call SeekForPrev() on index " + "blocks"); + key_.Clear(); + value_.clear(); + } + + void Prev() override; + + void Next() override; + + void SeekToFirst() override; + + void SeekToLast() override; + + void Invalidate(Status s) { InvalidateBase(s); } + + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. + IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr<GlobalSeqnoState> global_seqno_state_; + + // Set *prefix_may_exist to false if no key possibly share the same prefix + // as `target`. If not set, the result position should be the same as total + // order Seek. + bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); + // Set *prefix_may_exist to false if no key can possibly share the same + // prefix as `target`. If not set, the result position should be the same + // as total order seek. 
+ bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index, + bool* prefix_may_exist); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetKey(), b); + } + + inline bool ParseNextIndexKey(); + + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block.cc b/src/rocksdb/table/block_based/block_based_filter_block.cc new file mode 100644 index 000000000..de3f5cb13 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/block_based_filter_block.h" +#include <algorithm> + +#include "db/dbformat.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void AppendItem(std::string* props, const std::string& key, + const std::string& value) { + char cspace = ' '; + std::string value_str(""); + size_t i = 0; + const size_t dataLength = 64; + const size_t tabLength = 2; + const size_t offLength = 16; + + value_str.append(&value[i], std::min(size_t(dataLength), value.size())); + i += dataLength; + while (i < value.size()) { + value_str.append("\n"); + value_str.append(offLength, cspace); + value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i)); + i += dataLength; + } + + std::string result(""); + if (key.size() < (offLength - tabLength)) + result.append(size_t((offLength - tabLength)) - key.size(), cspace); + result.append(key); + + props->append(result + ": " + value_str + "\n"); +} + +template <class TKey> +void AppendItem(std::string* props, const TKey& key, const std::string& value) { + std::string key_str = ROCKSDB_NAMESPACE::ToString(key); + AppendItem(props, key_str, value); +} +} // namespace + +// See doc/table_format.txt for an explanation of the filter block format. 
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + prev_prefix_start_(0), + prev_prefix_size_(0), + num_added_(0) { + assert(policy_); +} + +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } + + if (whole_key_filtering_) { + AddKey(key); + } +} + +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + num_added_++; + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); +} + +// Add prefix to filter if needed +inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { + // get slice for most recently added entry + Slice prev; + if (prev_prefix_size_ > 0) { + prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_); + } + + Slice prefix = prefix_extractor_->Transform(key); + // insert prefix only when it's different from the previous prefix. + if (prev.size() == 0 || prefix != prev) { + prev_prefix_start_ = entries_.size(); + prev_prefix_size_ = prefix.size(); + AddKey(prefix); + } +} + +Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = static_cast<uint32_t>(result_.size()); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void BlockBasedFilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i + 1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
+ filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries), + &result_); + + tmp_entries_.clear(); + entries_.clear(); + start_.clear(); + prev_prefix_start_ = 0; + prev_prefix_size_ = 0; +} + +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + assert(table()); + assert(table()->get_rep()); + assert(table()->get_rep()->filter_policy); +} + +std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<BlockContents> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new BlockBasedFilterBlockReader(table, std::move(filter_block))); +} + +bool BlockBasedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(block_offset != kNotValid); + if (!whole_key_filtering()) { + return true; + } + return MayMatch(key, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(block_offset != kNotValid); + return MayMatch(prefix, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::ParseFieldsFromBlock( + const BlockContents& contents, const char** data, const char** offset, + size_t* num, size_t* base_lg) { + assert(data); + assert(offset); + assert(num); + assert(base_lg); + + const size_t n = contents.data.size(); + if (n < 5) { // 1 byte for base_lg and 4 for start of offset array + return false; + } + + const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5); + if (last_word > n - 5) { + return false; + } + + *data = contents.data.data(); + *offset = (*data) + last_word; + *num = (n - 5 - last_word) / 4; + *base_lg = contents.data[n - 1]; + + return true; +} + +bool BlockBasedFilterBlockReader::MayMatch( + const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + CachableEntry<BlockContents> filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return true; // Errors are treated as potential matches + } + + const uint64_t index = block_offset >> base_lg; + if (index < num) { + const uint32_t 
start = DecodeFixed32(offset + index * 4); + const uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset - data)) { + const Slice filter = Slice(data + start, limit - start); + + assert(table()); + assert(table()->get_rep()); + const FilterPolicy* const policy = table()->get_rep()->filter_policy; + + const bool may_match = policy->KeyMayMatch(entry, filter); + if (may_match) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } else if (start == limit) { + // Empty filters do not match any entries + return false; + } + } + return true; // Errors are treated as potential matches +} + +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; +} + +std::string BlockBasedFilterBlockReader::ToString() const { + CachableEntry<BlockContents> filter_block; + + const Status s = + GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + nullptr /* lookup_context */, &filter_block); + if (!s.ok()) { + return std::string("Unable to retrieve filter block"); + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return std::string("Error parsing filter block"); + } + + std::string result; + result.reserve(1024); + + std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); + AppendItem(&result, s_fb, ROCKSDB_NAMESPACE::ToString(num)); + AppendItem(&result, s_bo, s_hd); + + for (size_t index = 0; index < num; index++) { + uint32_t start = DecodeFixed32(offset + index * 4); + uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + + if (start != limit) { + result.append(" filter block # " + + ROCKSDB_NAMESPACE::ToString(index + 1) + "\n"); + Slice filter = Slice(data + start, limit - start); + AppendItem(&result, start, filter.ToString(true)); + } + } + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block.h b/src/rocksdb/table/block_based/block_based_filter_block.h new file mode 100644 index 000000000..01c98a70b --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. 
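As a concrete reading of the granularity mentioned above (a sketch with made-up offsets; kFilterBase = 1 << kFilterBaseLg = 2048 is defined in the .cc file):

    #include <cstdint>

    // One filter covers each 2048-byte range of data-block start offsets.
    uint64_t FilterIndexFor(uint64_t data_block_offset) {
      const uint64_t kFilterBase = 1ull << 11;  // matches kFilterBaseLg = 11
      return data_block_offset / kFilterBase;
    }

    // FilterIndexFor(0) == 0, FilterIndexFor(2000) == 0, and
    // FilterIndexFor(5000) == 2, so data blocks starting at offsets 0 and
    // 2000 are covered by the same filter.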
+ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete; + void operator=(const BlockBasedFilterBlockBuilder&) = delete; + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + size_t prev_prefix_start_; // the position of the last appended prefix + // to "entries_". + size_t prev_prefix_size_; // the length of the last appended prefix to + // "entries_". + std::string entries_; // Flattened entry contents + std::vector<size_t> start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument + std::vector<uint32_t> filter_offsets_; + size_t num_added_; // Number of keys added +}; + +// A FilterBlockReader is used to parse filter from SST table. 
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader + : public FilterBlockReaderCommon<BlockContents> { + public: + BlockBasedFilterBlockReader(const BlockBasedTable* t, + CachableEntry<BlockContents>&& filter_block); + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&) = delete; + void operator=(const BlockBasedFilterBlockReader&) = delete; + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + + // convert this object to a human readable form + std::string ToString() const override; + + private: + static bool ParseFieldsFromBlock(const BlockContents& contents, + const char** data, const char** offset, + size_t* num, size_t* base_lg); + + bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block_test.cc b/src/rocksdb/table/block_based/block_based_filter_block_test.cc new file mode 100644 index 000000000..283d6a9a2 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
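Before the test fixtures, a condensed sketch of the builder protocol the tests exercise, matching the "(StartBlock Add*)* Finish" regexp documented in the header above; `table_options` is a hypothetical, pre-configured BlockBasedTableOptions and error handling is elided:

    BlockBasedTableOptions table_options;
    table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));

    BlockBasedFilterBlockBuilder builder(nullptr /* prefix_extractor */,
                                         table_options);
    builder.StartBlock(0);     // data block at offset 0
    builder.Add("foo");
    builder.Add("bar");
    builder.StartBlock(2048);  // first data block in the next 2KB range
    builder.Add("baz");
    Slice filter_block = builder.Finish();  // filters + offsets + base_lg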
+ +#include "table/block_based/block_based_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class FilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FilterBlockTest() : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + Slice slice(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice(builder.Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, 
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + Slice slice(builder.Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + BlockBasedFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {} +}; + +TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + Slice slice(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + Slice slice(builder->Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + Slice slice(builder->Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", 
/*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_based_table_builder.cc b/src/rocksdb/table/block_based/block_based_table_builder.cc new file mode 100644 index 000000000..2003008fe --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.cc @@ -0,0 +1,1217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
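One piece of context worth sketching before the implementation: every block this builder writes is followed by a 5-byte trailer, one compression-type byte plus a fixed32 checksum covering both the block contents and the type byte. BuildCrc32cTrailer below is a local illustration, not a RocksDB API; it derives the kCRC32c trailer the same way WriteRawBlock() does later in this file:

    #include <cstdint>

    #include "rocksdb/slice.h"
    #include "util/coding.h"   // EncodeFixed32
    #include "util/crc32c.h"   // crc32c::Value / Extend / Mask

    namespace ROCKSDB_NAMESPACE {
    namespace {
    void BuildCrc32cTrailer(const Slice& block_contents, char type,
                            char* trailer /* 5 bytes */) {
      trailer[0] = type;  // compression type byte comes first
      uint32_t crc =
          crc32c::Value(block_contents.data(), block_contents.size());
      crc = crc32c::Extend(crc, trailer, 1);  // extend over the type byte
      EncodeFixed32(trailer + 1, crc32c::Mask(crc));  // masked crc
    }
    }  // namespace
    }  // namespace ROCKSDB_NAMESPACE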
+
+#include "table/block_based/block_based_table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "db/dbformat.h"
+#include "index_builder.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+#include "table/table_builder.h"
+
+#include "memory/memory_allocator.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+typedef BlockBasedTableOptions::IndexType IndexType;
+
+// Without an anonymous namespace here, we would trip the
+// -Wmissing-prototypes warning.
+namespace {
+
+// Create a filter block builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(
+    const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+    const FilterBuildingContext& context,
+    const bool use_delta_encoding_for_index_values,
+    PartitionedIndexBuilder* const p_index_builder) {
+  const BlockBasedTableOptions& table_opt = context.table_options;
+  if (table_opt.filter_policy == nullptr) return nullptr;
+
+  FilterBitsBuilder* filter_bits_builder =
+      BloomFilterPolicy::GetBuilderFromContext(context);
+  if (filter_bits_builder == nullptr) {
+    return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                            table_opt);
+  } else {
+    if (table_opt.partition_filters) {
+      assert(p_index_builder != nullptr);
+      // Since it takes a while from the filter builder's partition-cut
+      // request until the index builder actually cuts the partition, we
+      // take the lower bound as the partition size.
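+      // Worked example (illustrative numbers, not from the code): with
+      // metadata_block_size = 4096 and block_size_deviation = 10, the
+      // ceiling division below evaluates to ((4096 * 90) + 99) / 100 = 3687,
+      // i.e. at least 90% of the configured metadata block size.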
+      assert(table_opt.block_size_deviation <= 100);
+      auto partition_size =
+          static_cast<uint32_t>(((table_opt.metadata_block_size *
+                                  (100 - table_opt.block_size_deviation)) +
+                                 99) /
+                                100);
+      partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+      return new PartitionedFilterBlockBuilder(
+          mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+          filter_bits_builder, table_opt.index_block_restart_interval,
+          use_delta_encoding_for_index_values, p_index_builder, partition_size);
+    } else {
+      return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                        table_opt.whole_key_filtering,
+                                        filter_bits_builder);
+    }
+  }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+  // Accept the compressed form only if it is at least 12.5% smaller than
+  // the raw input.
+  return compressed_size < raw_size - (raw_size / 8u);
+}
+
+bool CompressBlockInternal(const Slice& raw,
+                           const CompressionInfo& compression_info,
+                           uint32_t format_version,
+                           std::string* compressed_output) {
+  // Will return compressed block contents if (1) the compression method is
+  // supported on this platform and (2) the compression rate is "good enough".
+  switch (compression_info.type()) {
+    case kSnappyCompression:
+      return Snappy_Compress(compression_info, raw.data(), raw.size(),
+                             compressed_output);
+    case kZlibCompression:
+      return Zlib_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kZlibCompression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kBZip2Compression:
+      return BZip2_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kBZip2Compression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kLZ4Compression:
+      return LZ4_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kLZ4Compression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kLZ4HCCompression:
+      return LZ4HC_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kLZ4HCCompression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kXpressCompression:
+      return XPRESS_Compress(raw.data(), raw.size(), compressed_output);
+    case kZSTD:
+    case kZSTDNotFinalCompression:
+      return ZSTD_Compress(compression_info, raw.data(), raw.size(),
+                           compressed_output);
+    default:
+      // Unrecognized compression type.
+      return false;
+  }
+}
+
+}  // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow) {
+  *type = info.type();
+
+  if (info.type() == kNoCompression && !info.SampleForCompression()) {
+    return raw;
+  }
+
+  // If requested, we sample one in every N blocks with a fast and a slow
+  // compression algorithm and report the stats. Users can use these stats
+  // to decide whether enabling compression is worthwhile, and to get a hint
+  // about which compression algorithm would be beneficial.
+  if (do_sample && info.SampleForCompression() &&
+      Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) &&
+      sampled_output_fast && sampled_output_slow) {
+    // Sampling with a fast compression algorithm
+    if (LZ4_Supported() || Snappy_Supported()) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (ZSTD_Supported() || Zlib_Supported()) {
+      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+      CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow);
+    }
+  }
+
+  // Actually compress the data
+  if (*type != kNoCompression) {
+    if (CompressBlockInternal(raw, info, format_version, compressed_output) &&
+        GoodCompressionRatio(compressed_output->size(), raw.size())) {
+      return *compressed_output;
+    }
+  }
+
+  // The compression method is not supported, or the compression ratio is
+  // not good enough, so just fall back to the uncompressed form.
+  *type = kNoCompression;
+  return raw;
+}
+
+// kBlockBasedTableMagicNumber was picked by running
+//    echo rocksdb.table.block_based | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by other
+// .cc files; for that reason we declare it extern in the header, but to get
+// the space allocated it must be non-extern in exactly one place.
+const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// We also support reading and writing legacy block based table format (for
+// backwards compatibility)
+const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// A collector that collects properties of interest to block-based table.
+// For now this class looks heavy-weight since we only write one additional
+// property.
+// But in the foreseeable future, we will add more and more properties that
+// are specific to block-based table.
+class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
+    : public IntTblPropCollector {
+ public:
+  explicit BlockBasedTablePropertiesCollector(
+      BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
+      bool prefix_filtering)
+      : index_type_(index_type),
+        whole_key_filtering_(whole_key_filtering),
+        prefix_filtering_(prefix_filtering) {}
+
+  Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // Intentionally left blank. We have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual void BlockAdd(uint64_t /* blockRawBytes */,
+                        uint64_t /* blockCompressedBytesFast */,
+                        uint64_t /* blockCompressedBytesSlow */) override {
+    // Intentionally left blank. No interest in collecting stats for
+    // blocks.
+    return;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string val;
+    PutFixed32(&val, static_cast<uint32_t>(index_type_));
+    properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
+    properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                        whole_key_filtering_ ? kPropTrue : kPropFalse});
+    properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
+                        prefix_filtering_ ? kPropTrue : kPropFalse});
+    return Status::OK();
+  }
+
+  // The name of the properties collector can be used for debugging purposes.
+ const char* Name() const override { + return "BlockBasedTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; + bool whole_key_filtering_; + bool prefix_filtering_; +}; + +struct BlockBasedTableBuilder::Rep { + const ImmutableCFOptions ioptions; + const MutableCFOptions moptions; + const BlockBasedTableOptions table_options; + const InternalKeyComparator& internal_comparator; + WritableFileWriter* file; + uint64_t offset = 0; + Status status; + size_t alignment; + BlockBuilder data_block; + // Buffers uncompressed data blocks and keys to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data + // blocks as it's redundant, but it's easier to implement for now. + std::vector<std::pair<std::string, std::vector<std::string>>> + data_block_and_keys_buffers; + BlockBuilder range_del_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr<IndexBuilder> index_builder; + PartitionedIndexBuilder* p_index_builder_ = nullptr; + + std::string last_key; + CompressionType compression_type; + uint64_t sample_for_compression; + CompressionOptions compression_opts; + std::unique_ptr<CompressionDict> compression_dict; + CompressionContext compression_ctx; + std::unique_ptr<UncompressionContext> verify_ctx; + std::unique_ptr<UncompressionDict> verify_dict; + + size_t data_begin_offset = 0; + + TableProperties props; + + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. 
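+  // Transition sketch (editorial summary; we assume Abandon() is also legal
+  // from kBuffered):
+  //
+  //   kBuffered --EnterUnbuffered()--> kUnbuffered --Finish()/Abandon()--> kClosed
+  //
+  // Builders configured without a compression dictionary
+  // (max_dict_bytes == 0) start directly in kUnbuffered; see the Rep
+  // constructor below.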
+ enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + + const bool use_delta_encoding_for_index_values; + std::unique_ptr<FilterBlockBuilder> filter_builder; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr<FlushBlockPolicy> flush_block_policy; + int level_at_creation; + uint32_t column_family_id; + const std::string& column_family_name; + uint64_t creation_time = 0; + uint64_t oldest_key_time = 0; + const uint64_t target_file_size; + uint64_t file_creation_time = 0; + + std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors; + + Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const BlockBasedTableOptions& table_opt, + const InternalKeyComparator& icomparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t _column_family_id, WritableFileWriter* f, + const CompressionType _compression_type, + const uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, const bool skip_filters, + const int _level_at_creation, const std::string& _column_family_name, + const uint64_t _creation_time, const uint64_t _oldest_key_time, + const uint64_t _target_file_size, const uint64_t _file_creation_time) + : ioptions(_ioptions), + moptions(_moptions), + table_options(table_opt), + internal_comparator(icomparator), + file(f), + alignment(table_options.block_align + ? std::min(table_options.block_size, kDefaultPageSize) + : 0), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(_moptions.prefix_extractor.get()), + compression_type(_compression_type), + sample_for_compression(_sample_for_compression), + compression_opts(_compression_opts), + compression_dict(), + compression_ctx(_compression_type), + verify_dict(), + state((_compression_opts.max_dict_bytes > 0) ? 
State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + compressed_cache_key_prefix_size(0), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)), + level_at_creation(_level_at_creation), + column_family_id(_column_family_id), + column_family_name(_column_family_name), + creation_time(_creation_time), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size), + file_creation_time(_file_creation_time) { + if (table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( + &internal_comparator, use_delta_encoding_for_index_values, + table_options); + index_builder.reset(p_index_builder_); + } else { + index_builder.reset(IndexBuilder::CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); + } + if (skip_filters) { + filter_builder = nullptr; + } else { + FilterBuildingContext context(table_options); + context.column_family_name = column_family_name; + context.compaction_style = ioptions.compaction_style; + context.level_at_creation = level_at_creation; + context.info_log = ioptions.info_log; + filter_builder.reset(CreateFilterBlockBuilder( + ioptions, moptions, context, use_delta_encoding_for_index_values, + p_index_builder_)); + } + + for (auto& collector_factories : *int_tbl_prop_collector_factories) { + table_properties_collectors.emplace_back( + collector_factories->CreateIntTblPropCollector(column_family_id)); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector( + table_options.index_type, table_options.whole_key_filtering, + _moptions.prefix_extractor != nullptr)); + if (table_options.verify_compression) { + verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), + compression_type)); + } + } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + ~Rep() {} +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time, const uint64_t oldest_key_time, + const uint64_t target_file_size, const uint64_t file_creation_time) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + ROCKS_LOG_WARN( + ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(ioptions, moptions, sanitized_table_options, + internal_comparator, int_tbl_prop_collector_factories, + column_family_id, file, compression_type, + sample_for_compression, compression_opts, skip_filters, + level_at_creation, column_family_name, 
creation_time, + oldest_key_time, target_file_size, file_creation_time); + + if (rep_->filter_builder != nullptr) { + rep_->filter_builder->StartBlock(0); + } + if (table_options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + table_options.block_cache_compressed.get(), file->writable_file(), + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + ValueType value_type = ExtractValueType(key); + if (IsValueType(value_type)) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } +#endif // NDEBUG + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + if (r->state == Rep::State::kBuffered && + r->data_begin_offset > r->target_file_size) { + EnterUnbuffered(); + } + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + if (ok() && r->state == Rep::State::kUnbuffered) { + r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + } + } + + // Note: PartitionedFilterBlockBuilder requires key being added to filter + // builder after being added to index builder. + if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { + size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + if (r->state == Rep::State::kBuffered) { + // Buffer keys to be replayed during `Finish()` once compression + // dictionary has been finalized. 
+ if (r->data_block_and_keys_buffers.empty() || should_flush) { + r->data_block_and_keys_buffers.emplace_back(); + } + r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + } else { + r->index_builder->OnKeyAdded(key); + } + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + + } else if (value_type == kTypeRangeDeletion) { + r->range_del_block.Add(key, value); + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + } else { + assert(false); + } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle, + bool is_data_block) { + WriteBlock(block->Finish(), handle, is_data_block); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, + BlockHandle* handle, + bool is_data_block) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + + auto type = r->compression_type; + uint64_t sample_for_compression = r->sample_for_compression; + Slice block_contents; + bool abort_compression = false; + + StopWatchNano timer( + r->ioptions.env, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + + if (r->state == Rep::State::kBuffered) { + assert(is_data_block); + assert(!r->data_block_and_keys_buffers.empty()); + r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + return; + } + + if (raw_block_contents.size() < kCompressionSizeLimit) { + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); + } + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, r->compression_ctx, + *compression_dict, type, + sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + block_contents = CompressBlock( + raw_block_contents, compression_info, &type, + r->table_options.format_version, is_data_block /* do_sample */, + &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, raw_block_contents.size(), + sampled_output_fast.size(), sampled_output_slow.size()); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. 
+ if (type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); + BlockContents contents; + UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + r->compression_type); + Status stat = UncompressBlockContentsForCompressionType( + uncompression_info, block_contents.data(), block_contents.size(), + &contents, r->table_options.format_version, r->ioptions); + + if (stat.ok()) { + bool compressed_ok = contents.data.compare(raw_block_contents) == 0; + if (!compressed_ok) { + // The result of the compression was invalid. abort. + abort_compression = true; + ROCKS_LOG_ERROR(r->ioptions.info_log, + "Decompressed block did not match raw block"); + r->status = + Status::Corruption("Decompressed block did not match raw block"); + } + } else { + // Decompression reported an error. abort. + r->status = Status::Corruption("Could not decompress"); + abort_compression = true; + } + } + } else { + // Block is too big to be compressed. + abort_compression = true; + } + + // Abort compression if the block is too big, or did not pass + // verification. + if (abort_compression) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + type = kNoCompression; + block_contents = raw_block_contents; + } else if (type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { + RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + raw_block_contents.size()); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + } else if (type != r->compression_type) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + } + + WriteRawBlock(block_contents, type, handle, is_data_block); + r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle, + bool is_data_block) { + Rep* r = rep_; + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + assert(r->status.ok()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + char* trailer_without_type = trailer + 1; + switch (r->table_options.checksum) { + case kNoChecksum: + EncodeFixed32(trailer_without_type, 0); + break; + case kCRC32c: { + auto crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); + break; + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, block_contents.data(), + static_cast<uint32_t>(block_contents.size())); + XXH32_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(state)); + XXH32_freeState(state); + break; + } + case kxxHash64: { + XXH64_state_t* const state = 
XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, block_contents.data(), + static_cast<uint32_t>(block_contents.size())); + XXH64_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32( + trailer_without_type, + static_cast<uint32_t>(XXH64_digest(state) & // lower 32 bits + uint64_t{0xffffffff})); + XXH64_freeState(state); + break; + } + } + + assert(r->status.ok()); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", + static_cast<char*>(trailer)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - ((block_contents.size() + kBlockTrailerSize) & + (r->alignment - 1))) & + (r->alignment - 1); + r->status = r->file->Pad(pad_bytes); + if (r->status.ok()) { + r->offset += pad_bytes; + } + } + } + } +} + +Status BlockBasedTableBuilder::status() const { return rep_->status; } + +static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { + BlockContents* bc = reinterpret_cast<BlockContents*>(value); + delete bc; +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + size_t size = block_contents.size(); + + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); + memcpy(ubuf.get(), block_contents.data(), size); + ubuf[size] = type; + + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->is_raw_block = true; +#endif // NDEBUG + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, + static_cast<size_t>(end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCachedBlockContents); + + // Invalidate OS cache. + r->file->InvalidateCache(static_cast<size_t>(r->offset), size); + } + return Status::OK(); +} + +void BlockBasedTableBuilder::WriteFilterBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle filter_block_handle; + bool empty_filter_block = (rep_->filter_builder == nullptr || + rep_->filter_builder->NumAdded() == 0); + if (ok() && !empty_filter_block) { + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + Slice filter_content = + rep_->filter_builder->Finish(filter_block_handle, &s); + assert(s.ok() || s.IsIncomplete()); + rep_->props.filter_size += filter_content.size(); + WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); + } + } + if (ok() && !empty_filter_block) { + // Add mapping from "<filter_block_prefix>.Name" to location + // of filter data. 
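The `block_align` branch above pads each physical block so that the next one starts on an `r->alignment` boundary; the bitmask expression is only valid because the alignment is a power of two. A worked example of the arithmetic, using 5 as the trailer size implied by the code above (one type byte plus a 32-bit checksum):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the padding arithmetic above: align (block + trailer) up to a
// power-of-two boundary. `alignment` must be a power of two for the
// bitmask trick to work.
size_t PadBytes(size_t block_size, size_t trailer_size, size_t alignment) {
  return (alignment - ((block_size + trailer_size) & (alignment - 1))) &
         (alignment - 1);
}

int main() {
  // A 4000-byte block plus a 5-byte trailer occupies 4005 bytes; padding it
  // to a 4096-byte boundary requires 91 more bytes.
  std::printf("%zu\n", PadBytes(4000, 5, 4096));  // prints 91
  // An exactly aligned block needs no padding at all.
  std::printf("%zu\n", PadBytes(4091, 5, 4096));  // prints 0
  return 0;
}
```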
+    std::string key;
+    if (rep_->filter_builder->IsBlockBased()) {
+      key = BlockBasedTable::kFilterBlockPrefix;
+    } else {
+      key = rep_->table_options.partition_filters
+                ? BlockBasedTable::kPartitionedFilterBlockPrefix
+                : BlockBasedTable::kFullFilterBlockPrefix;
+    }
+    key.append(rep_->table_options.filter_policy->Name());
+    meta_index_builder->Add(key, filter_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+  IndexBuilder::IndexBlocks index_blocks;
+  auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+  if (index_builder_status.IsIncomplete()) {
+    // If we have more than one index partition then meta_blocks are not
+    // supported for the index. Currently meta_blocks are used only by
+    // HashIndexBuilder which is not multi-partition.
+    assert(index_blocks.meta_blocks.empty());
+  } else if (ok() && !index_builder_status.ok()) {
+    rep_->status = index_builder_status;
+  }
+  if (ok()) {
+    for (const auto& item : index_blocks.meta_blocks) {
+      BlockHandle block_handle;
+      WriteBlock(item.second, &block_handle, false /* is_data_block */);
+      if (!ok()) {
+        break;
+      }
+      meta_index_builder->Add(item.first, block_handle);
+    }
+  }
+  if (ok()) {
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+    } else {
+      WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+                    index_block_handle);
+    }
+  }
+  // If there are more index partitions, finish them and write them out
+  Status s = index_builder_status;
+  while (ok() && s.IsIncomplete()) {
+    s = rep_->index_builder->Finish(&index_blocks, *index_block_handle);
+    if (!s.ok() && !s.IsIncomplete()) {
+      rep_->status = s;
+      return;
+    }
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+    } else {
+      WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+                    index_block_handle);
+    }
+    // The last index_block_handle will be for the partition index block
+  }
+}
+
+void BlockBasedTableBuilder::WritePropertiesBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  BlockHandle properties_block_handle;
+  if (ok()) {
+    PropertyBlockBuilder property_block_builder;
+    rep_->props.column_family_id = rep_->column_family_id;
+    rep_->props.column_family_name = rep_->column_family_name;
+    rep_->props.filter_policy_name =
+        rep_->table_options.filter_policy != nullptr
+            ? rep_->table_options.filter_policy->Name()
+            : "";
+    rep_->props.index_size =
+        rep_->index_builder->IndexSize() + kBlockTrailerSize;
+    rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
+                                      ? rep_->ioptions.user_comparator->Name()
+                                      : "nullptr";
+    rep_->props.merge_operator_name =
+        rep_->ioptions.merge_operator != nullptr
+            ? rep_->ioptions.merge_operator->Name()
+            : "nullptr";
+    rep_->props.compression_name =
+        CompressionTypeToString(rep_->compression_type);
+    rep_->props.compression_options =
+        CompressionOptionsToString(rep_->compression_opts);
+    rep_->props.prefix_extractor_name =
+        rep_->moptions.prefix_extractor != nullptr
+            ? rep_->moptions.prefix_extractor->Name()
+            : "nullptr";
+
+    std::string property_collectors_names = "[";
+    for (size_t i = 0;
+         i < rep_->ioptions.table_properties_collector_factories.size(); ++i) {
+      if (i != 0) {
+        property_collectors_names += ",";
+      }
+      property_collectors_names +=
+          rep_->ioptions.table_properties_collector_factories[i]->Name();
+    }
+    property_collectors_names += "]";
+    rep_->props.property_collectors_names = property_collectors_names;
+    if (rep_->table_options.index_type ==
+        BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      assert(rep_->p_index_builder_ != nullptr);
+      rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
+      rep_->props.top_level_index_size =
+          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+    }
+    rep_->props.index_key_is_user_key =
+        !rep_->index_builder->seperator_is_key_plus_seq();
+    rep_->props.index_value_is_delta_encoded =
+        rep_->use_delta_encoding_for_index_values;
+    rep_->props.creation_time = rep_->creation_time;
+    rep_->props.oldest_key_time = rep_->oldest_key_time;
+    rep_->props.file_creation_time = rep_->file_creation_time;
+
+    // Add basic properties
+    property_block_builder.AddTableProperty(rep_->props);
+
+    // Add user-collected properties
+    NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors,
+                                         rep_->ioptions.info_log,
+                                         &property_block_builder);
+
+    WriteRawBlock(property_block_builder.Finish(), kNoCompression,
+                  &properties_block_handle);
+  }
+  if (ok()) {
+#ifndef NDEBUG
+    {
+      uint64_t props_block_offset = properties_block_handle.offset();
+      uint64_t props_block_size = properties_block_handle.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+          &props_block_offset);
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+          &props_block_size);
+    }
+#endif  // !NDEBUG
+    meta_index_builder->Add(kPropertiesBlock, properties_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteCompressionDictBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (rep_->compression_dict != nullptr &&
+      rep_->compression_dict->GetRawDict().size()) {
+    BlockHandle compression_dict_block_handle;
+    if (ok()) {
+      WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression,
+                    &compression_dict_block_handle);
+#ifndef NDEBUG
+      Slice compression_dict = rep_->compression_dict->GetRawDict();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+          &compression_dict);
+#endif  // NDEBUG
+    }
+    if (ok()) {
+      meta_index_builder->Add(kCompressionDictBlock,
+                              compression_dict_block_handle);
+    }
+  }
+}
+
+void BlockBasedTableBuilder::WriteRangeDelBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (ok() && !rep_->range_del_block.empty()) {
+    BlockHandle range_del_block_handle;
+    WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
+                  &range_del_block_handle);
+    meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
+                                         BlockHandle& index_block_handle) {
+  Rep* r = rep_;
+  // No need to write out new footer if we're using default checksum.
+  // We're writing the legacy magic number because we want old versions of
+  // RocksDB to be able to read files generated with a new release (just in
+  // case somebody wants to roll back after an upgrade).
+  // TODO(icanadi) at some point in the future, when we're absolutely sure
+  // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+  // number and always write new table files with the new magic number.
+  bool legacy = (r->table_options.format_version == 0);
+  // This is guaranteed by BlockBasedTableBuilder's constructor.
+  assert(r->table_options.checksum == kCRC32c ||
+         r->table_options.format_version != 0);
+  Footer footer(
+      legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
+      r->table_options.format_version);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  footer.set_index_handle(index_block_handle);
+  footer.set_checksum(r->table_options.checksum);
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  assert(r->status.ok());
+  r->status = r->file->Append(footer_encoding);
+  if (r->status.ok()) {
+    r->offset += footer_encoding.size();
+  }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kBuffered);
+  r->state = Rep::State::kUnbuffered;
+  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+                                  ? r->compression_opts.zstd_max_train_bytes
+                                  : r->compression_opts.max_dict_bytes;
+  Random64 generator{r->creation_time};
+  std::string compression_dict_samples;
+  std::vector<size_t> compression_dict_sample_lens;
+  if (!r->data_block_and_keys_buffers.empty()) {
+    while (compression_dict_samples.size() < kSampleBytes) {
+      size_t rand_idx = static_cast<size_t>(
+          generator.Uniform(r->data_block_and_keys_buffers.size()));
+      size_t copy_len =
+          std::min(kSampleBytes - compression_dict_samples.size(),
+                   r->data_block_and_keys_buffers[rand_idx].first.size());
+      compression_dict_samples.append(
+          r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
+      compression_dict_sample_lens.emplace_back(copy_len);
+    }
+  }
+
+  // The final data block has been flushed; now we can generate a dictionary
+  // from the samples. It is OK if compression_dict_samples is empty; we'll
+  // just get an empty dictionary.
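Viewed in isolation, the sampling loop above keeps appending prefixes of uniformly chosen buffered blocks (drawn with replacement) until the target byte budget is reached; the collected samples are then fed to the dictionary trainer below. A self-contained sketch of the same policy; `std::mt19937_64` stands in for the builder's `Random64`, the fixed seed is illustrative only, and the caller must pass at least one non-empty block or the loop would never terminate:

```cpp
#include <algorithm>
#include <random>
#include <string>
#include <vector>

// Append prefixes of randomly chosen blocks until `target_bytes` of sample
// data have been gathered. Blocks may be picked more than once; that matches
// the uniform-with-replacement draw used by the builder.
std::string CollectDictionarySamples(const std::vector<std::string>& blocks,
                                     size_t target_bytes,
                                     std::vector<size_t>* sample_lens) {
  std::string samples;
  std::mt19937_64 rng{42};  // the builder seeds from the file creation time
  std::uniform_int_distribution<size_t> pick(0, blocks.size() - 1);
  while (samples.size() < target_bytes) {
    const std::string& block = blocks[pick(rng)];
    size_t copy_len = std::min(target_bytes - samples.size(), block.size());
    samples.append(block, 0, copy_len);
    sample_lens->push_back(copy_len);
  }
  return samples;
}
```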
+ std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { + const auto& data_block = r->data_block_and_keys_buffers[i].first; + auto& keys = r->data_block_and_keys_buffers[i].second; + assert(!data_block.empty()); + assert(!keys.empty()); + + for (const auto& key : keys) { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); + if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + Slice first_key_in_next_block = + r->data_block_and_keys_buffers[i + 1].second.front(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, + r->pending_handle); + } + } + r->data_block_and_keys_buffers.clear(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + assert(r->state != Rep::State::kClosed); + bool empty_data_block = r->data_block.empty(); + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. 
Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); + if (ok()) { + // flush the meta index block + WriteRawBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle); + } + if (ok()) { + WriteFooter(metaindex_block_handle, index_block_handle); + } + if (r->file != nullptr) { + file_checksum_ = r->file->GetFileChecksum(); + } + r->state = Rep::State::kClosed; + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + assert(rep_->state != Rep::State::kClosed); + rep_->state = Rep::State::kClosed; +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } + +bool BlockBasedTableBuilder::NeedCompact() const { + for (const auto& collector : rep_->table_properties_collectors) { + if (collector->NeedCompact()) { + return true; + } + } + return false; +} + +TableProperties BlockBasedTableBuilder::GetTableProperties() const { + TableProperties ret = rep_->props; + for (const auto& collector : rep_->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + ret.readable_properties.insert(prop); + } + collector->Finish(&ret.user_collected_properties); + } + return ret; +} + +const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName.c_str(); + } +} + +const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; +const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = + "partitionedfilter."; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_builder.h b/src/rocksdb/table/block_based/block_based_table_builder.h new file mode 100644 index 000000000..97c9bc65a --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.h @@ -0,0 +1,157 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include <stdint.h> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/version_edit.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + const uint64_t target_file_size = 0, + const uint64_t file_creation_time = 0); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + bool NeedCompact() const override; + + // Get table properties + TableProperties GetTableProperties() const override; + + // Get file checksum + const std::string& GetFileChecksum() const override { return file_checksum_; } + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + private: + bool ok() const { return status().ok(); } + + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. 
+ // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + + // Call block's Finish() method + // and then write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + + // Compress and write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + bool is_data_block); + // Directly write data to the file. + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, + bool is_data_block = false); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Some compression libraries fail when the raw size is bigger than int. If + // uncompressed size is bigger than kCompressionSizeLimit, don't compress it + const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max(); + + // Store file checksum. If checksum is disabled, its value is "0". + std::string file_checksum_ = kUnknownFileChecksum; +}; + +Slice CompressBlock(const Slice& raw, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc new file mode 100644 index 000000000..70a6f38d5 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.cc @@ -0,0 +1,649 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
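To make the contract of the builder declared above concrete, here is a hedged usage sketch: keys must arrive pre-sorted (and, in real use, be correctly encoded internal keys), `status()` should be polled, and a builder that will never be `Finish()`ed must be `Abandon()`ed before destruction. `WriteSortedEntries` is a hypothetical helper, not part of the codebase, and assumes the declarations above are in scope:

```cpp
#include <string>
#include <utility>
#include <vector>

Status WriteSortedEntries(
    BlockBasedTableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
  for (const auto& kv : sorted_kvs) {
    builder->Add(kv.first, kv.second);  // REQUIRES: ascending key order
    if (!builder->status().ok()) {
      builder->Abandon();  // mandatory if Finish() will never be called
      return builder->status();
    }
  }
  return builder->Finish();  // writes meta blocks, metaindex and footer
}
```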
+
+#include <stdint.h>
+#include <cinttypes>
+
+#include <memory>
+#include <string>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+  MutexLock l(&mutex_);
+  if (num_records_ < kNumTracked) {
+    num_records_++;
+  }
+  records_[next_++] = len;
+  if (next_ == kNumTracked) {
+    next_ = 0;
+  }
+}
+
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+  std::vector<size_t> sorted;
+  {
+    MutexLock l(&mutex_);
+
+    if (num_records_ == 0) {
+      return 0;
+    }
+    sorted.assign(records_, records_ + num_records_);
+  }
+
+  // Of the historic sizes, we find the maximum one that satisfies the
+  // condition that if prefetching all, less than 1/8 will be wasted.
+  std::sort(sorted.begin(), sorted.end());
+
+  // Assuming we have 5 data points, and after sorting it looks like this:
+  //
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // +---+ | | | |
+  // | | | | | |
+  // +---+ | | | | | |
+  // | | | | | | | |
+  // +---+ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // and we use each of the values as a candidate, and estimate how much is
+  // wasted compared to how much is read. For example, when we use the 3rd
+  // record as the candidate, this area is what we read:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ***+ *** *** *** *** **
+  // * | | | | | |
+  // +---+ | | | | | *
+  // * | | | | | | | |
+  // +---+ | | | | | | | *
+  // * | | | | X | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // *** *** ***-*** ***--*** ***--*** +****
+  // which is (size of the record) X (number of records).
+  //
+  // While wasted is this area:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ****---+ | | | |
+  // * * | | | | |
+  // * *-*** *** | | | | |
+  // * * | | | | | | |
+  // *--** *** | | | | | | |
+  // | | | | | X | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // Which can be calculated iteratively.
+  // The difference between the waste when using the 4th and the 3rd record
+  // will be the following area:
+  // +---+
+  // +--+ +-+ ++ +-+ +-+ +---+ | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // +-+ +-+ +-+ ++ +---+ +--+ | | |
+  // | | | | | | |
+  // +---+ ++ | | | | | |
+  // | | | | | | X | | |
+  // +---+ ++ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // which will be the size difference between the 4th and 3rd records,
+  // times 3, which is the number of records before the 4th.
+  // Here we assume that all data within the prefetch range will be useful.
In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + +// TODO(myabandeh): We should return an error instead of silently changing the +// options +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + table_options_.index_block_restart_interval != 1) { + // Currently kHashSearch is incompatible with index_block_restart_interval > 1 + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } +} + +Status BlockBasedTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader, table_reader_options.prefix_extractor, + prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + auto table_builder = new BlockBasedTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_options_, table_builder_options.internal_comparator, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_builder_options.compression_type, + table_builder_options.sample_for_compression, + 
table_builder_options.compression_opts,
+      table_builder_options.skip_filters,
+      table_builder_options.column_family_name, table_builder_options.level,
+      table_builder_options.creation_time,
+      table_builder_options.oldest_key_time,
+      table_builder_options.target_file_size,
+      table_builder_options.file_creation_time);
+
+  return table_builder;
+}
+
+Status BlockBasedTableFactory::SanitizeOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      cf_opts.prefix_extractor == nullptr) {
+    return Status::InvalidArgument(
+        "Hash index is specified for block-based "
+        "table, but prefix_extractor is not given");
+  }
+  if (table_options_.cache_index_and_filter_blocks &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable cache_index_and_filter_blocks, "
+        "but block cache is disabled");
+  }
+  if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable pin_l0_filter_and_index_blocks_in_cache, "
+        "but block cache is disabled");
+  }
+  if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
+    return Status::InvalidArgument(
+        "Unsupported BlockBasedTable format_version. Please check "
+        "include/rocksdb/table.h for more info");
+  }
+  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
+    return Status::InvalidArgument(
+        "Enable block_align, but compression "
+        "enabled");
+  }
+  if (table_options_.block_align &&
+      (table_options_.block_size & (table_options_.block_size - 1))) {
+    return Status::InvalidArgument(
+        "Block alignment requested but block size is not a power of 2");
+  }
+  if (table_options_.block_size > port::kMaxUint32) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.data_block_index_type ==
+          BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+      table_options_.data_block_hash_table_util_ratio <= 0) {
+    return Status::InvalidArgument(
+        "data_block_hash_table_util_ratio should be greater than 0 when "
+        "data_block_index_type is set to kDataBlockBinaryAndHash");
+  }
+  if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
+    // TODO(myabandeh): support it
+    return Status::InvalidArgument(
+        "max_successive_merges larger than 0 is currently inconsistent with "
+        "unordered_write");
+  }
+  return Status::OK();
+}
+
+std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  flush_block_policy_factory: %s (%p)\n",
+           table_options_.flush_block_policy_factory->Name(),
+           static_cast<void*>(table_options_.flush_block_policy_factory.get()));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cache_index_and_filter_blocks: %d\n",
+           table_options_.cache_index_and_filter_blocks);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  cache_index_and_filter_blocks_with_high_priority: %d\n",
+           table_options_.cache_index_and_filter_blocks_with_high_priority);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  pin_l0_filter_and_index_blocks_in_cache: %d\n",
+           table_options_.pin_l0_filter_and_index_blocks_in_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  pin_top_level_index_and_filter: %d\n",
+           table_options_.pin_top_level_index_and_filter);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " 
index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast<int>(table_options_.index_shortening)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", + table_options_.hash_index_allow_collision); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast<void*>(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + static_cast<void*>(table_options_.block_cache_compressed.get())); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + const char* block_cache_compressed_name = + table_options_.block_cache_compressed->Name(); + if (block_cache_compressed_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_compressed_name); + ret.append(buffer); + } + ret.append(" block_cache_compressed_options:\n"); + ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast<void*>(table_options_.persistent_cache.get())); + ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? 
"nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + return ret; +} + +#ifndef ROCKSDB_LITE +namespace { +bool SerializeSingleBlockBasedTableOption( + std::string* opt_string, const BlockBasedTableOptions& bbt_options, + const std::string& name, const std::string& delimiter) { + auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + return false; + } + auto& opt_info = iter->second; + const char* opt_address = + reinterpret_cast<const char*>(&bbt_options) + opt_info.offset; + std::string value; + bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); + if (result) { + *opt_string = name + "=" + value + delimiter; + } + return result; +} +} // namespace + +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + assert(opt_string); + opt_string->clear(); + for (auto iter = block_based_table_type_info.begin(); + iter != block_based_table_type_info.end(); ++iter) { + if (iter->second.verification == OptionVerificationType::kDeprecated) { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + continue; + } + std::string single_output; + bool result = SerializeSingleBlockBasedTableOption( + &single_output, table_options_, iter->first, delimiter); + assert(result); + if (result) { + opt_string->append(single_output); + } + } + return Status::OK(); +} +#else +Status BlockBasedTableFactory::GetOptionString( + std::string* /*opt_string*/, const std::string& /*delimiter*/) const { + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { + return table_options_; +} + +#ifndef ROCKSDB_LITE +namespace { +std::string ParseBlockBasedTableOption(const std::string& name, + const std::string& org_value, + BlockBasedTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + if (!input_strings_escaped) { + // if the input string is not escaped, it means this function is + // invoked from SetOptions, which takes the old format. + if (name == "block_cache" || name == "block_cache_compressed") { + // cache options can be specified in the following format + // "block_cache={capacity=1M;num_shard_bits=4; + // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" + // To support backward compatibility, the following format + // is also supported. + // "block_cache=1M" + std::shared_ptr<Cache> cache; + // block_cache is specified in format block_cache=<cache_size>. 
+ if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + if (!ParseOptionHelper(reinterpret_cast<char*>(&cache_opts), + OptionType::kLRUCacheOptions, value)) { + return "Invalid cache options"; + } + cache = NewLRUCache(cache_opts); + } + + if (name == "block_cache") { + new_options->block_cache = cache; + } else { + new_options->block_cache_compressed = cache; + } + return ""; + } else if (name == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (value.compare(0, kName.size(), kName) != 0) { + return "Invalid filter policy name"; + } + size_t pos = value.find(':', kName.size()); + if (pos == std::string::npos) { + return "Invalid filter policy config, missing bits_per_key"; + } + double bits_per_key = + ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + new_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + return ""; + } + } + const auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} +} // namespace + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParseBlockBasedTableOption( + o.first, o.second, new_table_options, input_strings_escaped, + ignore_unknown_options); + if (error_message != "") { + const auto iter = block_based_table_type_info.find(o.first); + if (iter == block_based_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". 
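The option-string entry points defined above accept both cache spellings described in the parser's comment: the backward-compatible bare capacity and the struct-style `LRUCacheOptions` list. A hedged usage sketch, assuming the public declaration lives in rocksdb/convenience.h as in upstream RocksDB:

```cpp
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

void ParseCacheOptionsDemo() {
  using namespace ROCKSDB_NAMESPACE;
  BlockBasedTableOptions base;
  BlockBasedTableOptions parsed;
  // Backward-compatible shorthand: just a capacity, plus a bloom filter
  // in the bloomfilter:<bits_per_key>:<use_block_based_builder> format.
  Status s = GetBlockBasedTableOptionsFromString(
      base, "block_cache=1M;filter_policy=bloomfilter:10:false", &parsed);
  assert(s.ok());
  // Struct-style form with full LRUCacheOptions fields.
  s = GetBlockBasedTableOptionsFromString(
      base,
      "block_cache={capacity=1M;num_shard_bits=4;strict_capacity_limit=true;"
      "high_pri_pool_ratio=0.5;}",
      &parsed);
  assert(s.ok());
}
```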
+ *new_table_options = table_options; + return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level) { + if ((base_tf != nullptr) != (file_tf != nullptr) && + sanity_check_level > kSanityLevelNone) { + return Status::Corruption( + "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); + } + if (base_tf == nullptr) { + return Status::OK(); + } + assert(file_tf != nullptr); + + const auto& base_opt = base_tf->table_options(); + const auto& file_opt = file_tf->table_options(); + + for (auto& pair : block_based_table_type_info) { + if (pair.second.verification == OptionVerificationType::kDeprecated) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + continue; + } + if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { + if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt), + reinterpret_cast<const char*>(&file_opt), + pair.second, pair.first, nullptr)) { + return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on BlockBasedTableOptions::", + pair.first); + } + } + } + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.h b/src/rocksdb/table/block_based/block_based_table_factory.h new file mode 100644 index 000000000..7c8633c07 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.h @@ -0,0 +1,195 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <memory> +#include <string> + +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class BlockBasedTableBuilder; + +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. 
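`NewBlockBasedTableFactory` above is the entry point most applications reach indirectly through their options. A typical wiring sketch using only the public RocksDB API; the specific option values are illustrative, not recommendations:

```cpp
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Build table options, wrap them in a factory, and hand the factory to the
// Options object that will be used to open the DB.
ROCKSDB_NAMESPACE::Options MakeOptions() {
  ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
  table_options.block_size = 16 * 1024;  // illustrative value
  table_options.filter_policy.reset(
      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
  ROCKSDB_NAMESPACE::Options options;
  options.table_factory.reset(
      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
  return options;
}
```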
+class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. + size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + const char* Name() const override { return kName.c_str(); } + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + // Sanitizes the specified DB Options. + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + + std::string GetPrintableTableOptions() const override; + + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override; + + const BlockBasedTableOptions& table_options() const; + + void* GetOptions() override { return &table_options_; } + + bool IsDeleteRangeSupported() const override { return true; } + + static const std::string kName; + + private: + BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +extern const std::string kPropTrue; +extern const std::string kPropFalse; + +#ifndef ROCKSDB_LITE +extern Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level); + +static std::unordered_map<std::string, OptionTypeInfo> + block_based_table_type_info = { + /* currently not supported + std::shared_ptr<Cache> block_cache = nullptr; + std::shared_ptr<Cache> block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), + OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, + false, 0}}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"index_type", + {offsetof(struct BlockBasedTableOptions, index_type), + OptionType::kBlockBasedTableIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + 
OptionVerificationType::kNormal, false, 0}}, + {"index_shortening", + {offsetof(struct BlockBasedTableOptions, index_shortening), + OptionType::kBlockBasedTableIndexShorteningMode, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, false, + 0}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, + 0}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kFilterPolicy, OptionVerificationType::kByName, false, + 0}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.cc b/src/rocksdb/table/block_based/block_based_table_reader.cc new file mode 100644 index 000000000..9b37b431f --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.cc @@ -0,0 +1,4531 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" +#include <algorithm> +#include <array> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" + +#include "monitoring/perf_context_imp.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTable::IndexReader IndexReader; + +// Found that 256 KB readahead size provides the best performance, based on +// experiments, for auto readahead. Experiment data is in PR #3282. 
+const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0); + +template <typename TBlocklike> +class BlocklikeTraits; + +template <> +class BlocklikeTraits<BlockContents> { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits<ParsedFullFilterBlock> { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } +}; + +template <> +class BlocklikeTraits<Block> { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template <> +class BlocklikeTraits<UncompressionDict> { + public: + static UncompressionDict* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } +}; + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. 
+template <typename TBlocklike> +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + Status s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + result->reset(BlocklikeTraits<TBlocklike>::Create( + std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics, using_zstd, filter_policy)); + } + + return s; +} + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Delete the entry resided in the cache. +template <class Entry> +void DeleteCachedEntry(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast<Entry*>(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, true /* force_erase */); +} + +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, false /* force_erase */); +} + +// For hash based index, return true if prefix_extractor and +// prefix_extractor_block mismatch, false otherwise. This flag will be used +// as total_order_seek via NewIndexIterator +bool PrefixExtractorChanged(const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
+ // Turn off hash index if prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) != 0) { + return true; + } else { + return false; + } +} + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + +} // namespace + +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ?
index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry<Block> index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} + +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + InternalIteratorBase<IndexValue>* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
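+ // (If CacheDependencies() pinned the partitions, partition_map_ maps each + // partition's offset to its cached block, so second-level reads can be + // served from the pinned entries.)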
+ it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currently it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. + } + + void CacheDependencies(bool pin) override { + // Before reading the partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry<Block> index_block; + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", + s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + // Index partitions are assumed to be consecutive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index.
+ return; + } + handle = biter.value().handle; + uint64_t last_off = handle.offset() + block_size(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry<Block> block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + partition_map_[handle.offset()] = std::move(block); + } + } + } + } + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an instance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified.
+ static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->internal_prefix_transform.get() != nullptr); + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast<HashIndexReader*>(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); + } + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<HashIndexReader*>(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr<BlockPrefixIndex> prefix_index_; +}; + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if 
(get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, BlockType block_type, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + + if (cache_handle != nullptr) { + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); + } else { + UpdateCacheMissMetrics(block_type, get_context); + } + + return cache_handle; +} + +// Helper function to setup the cache key's prefix for the Table. 
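+// A complete block cache key is this per-file prefix followed by the +// varint64-encoded block offset; see GetCacheKey() below. A sketch (the +// buffer name is illustrative only): +// +// char buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; +// Slice key = GetCacheKey(rep->cache_key_prefix, +// rep->cache_key_prefix_size, handle, buf);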
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); + } + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), + rep->file->file(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +namespace { +// Return true if the `user_prop_name` property in table_properties has a +// `true` value, or if the property is absent (for backward compatibility). +bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older versions don't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array<char, 200> msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file has a global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array<char, 200> msg_buf; + // This is a v1 external sst file, global_seqno is not supported.
+ snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader opens the table reader with kMaxSequenceNumber as largest_seqno + // to denote that it is unknown. + if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array<char, 200> msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast<unsigned long long>(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array<char, 200> msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast<unsigned long long>(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast<size_t>(end - cache_key)); +} + +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + if (!ioptions.allow_mmap_reads) { + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + } else { + // Should not prefetch for mmap mode. + prefetch_buffer.reset(new FilePrefetchBuffer( + nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + } + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5.
[meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with a newer " + "version of RocksDB?"); + } + + // We've successfully read the footer. We are ready to serve requests. + // Better not to mutate rep_ after creation, e.g. the internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, level, + immortal_table); + rep->file = std::move(file); + rep->footer = footer; + rep->hash_index_allow_collision = table_options.hash_index_allow_collision; + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefixes correctly. + if (prefix_extractor != nullptr) { + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(prefix_extractor)); + } + SetupCacheKeyPrefix(rep); + std::unique_ptr<BlockBasedTable> new_table( + new BlockBasedTable(rep, block_cache_tracer)); + + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + + // Meta-blocks are not dictionary compressed. Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + // Read metaindex + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), + internal_comparator, &lookup_context); + if (!s.ok()) { + return s; + } + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, &lookup_context); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read()); + } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes.
+ tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast<size_t>(file_size); + } else { + prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer( + nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer( + nullptr, 0, 0, true /* enable */, true /* track_min_offset */)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast<int>(len), 0); + break; + case kxxHash64: + actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. 
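+ // The re-verification relies on the block trailer layout: the byte at + // offset block_size is the compression type, and the following four bytes + // hold the stored checksum, which is computed over the payload plus the + // type byte (hence the block_size + 1 offsets below).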
+ BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = static_cast<size_t>(props_block_handle.size()); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(), + tmp_buf.get(), block_size + 1, value); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { + bool found_properties_block = true; + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties( + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); + } + std::unique_ptr<TableProperties> props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); + } + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep_->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Cannot find Properties block from file."); + } +#ifndef ROCKSDB_LITE + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. 
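+ // (Where a property is absent below, the reader conservatively assumes the + // older on-disk format: index keys are treated as internal keys and index + // values as full, non-delta-encoded handles.)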
+ if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep_->fragmented_range_dels = + std::make_shared<FragmentedRangeTombstoneList>(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { + Status s; + + // Find filter handle and filter type + if (rep_->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; 
+ break; + } + } + } + + // Find compression dictionary handle + bool found_compression_dict = false; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; + } + + BlockBasedTableOptions::IndexType index_type = rep_->index_type; + + const bool use_cache = table_options.cache_index_and_filter_blocks; + + // pin both index and filters, down to all partitions + const bool pin_all = + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + + // prefetch the first level of index + const bool prefetch_index = + prefetch_all || + (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + std::unique_ptr<IndexReader> index_reader; + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of a partitioned index are always stored in the cache. They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep_->index_reader->CacheDependencies(pin_all); + } + + // prefetch the first level of filter + const bool prefetch_filter = + prefetch_all || + (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); + // Partition filters cannot be enabled without partition indexes + assert(!prefetch_filter || prefetch_index); + // pin the first level of filter + const bool pin_filter = + pin_all || (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); + + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + if (filter) { + // Refer to the comment above about partitioned indexes always being cached + if (prefetch_all) { + filter->CacheDependencies(pin_all); + } + + rep_->filter = std::move(filter); + } + } + + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr<UncompressionDictReader> uncompression_dict_reader; + s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, + prefetch_all, pin_all, lookup_context, + &uncompression_dict_reader); + if (!s.ok()) { + return s; + } + + rep_->uncompression_dict_reader = std::move(uncompression_dict_reader); + } + + assert(s.ok()); + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->ioptions.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->file()->Hint(FSRandomAccessFile::kNormal); + break; + case Options::SEQUENTIAL: + rep_->file->file()->Hint(FSRandomAccessFile::kSequential); + break; + case Options::WILLNEED: + rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed); + break; + default: + assert(false); + } +} + +std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties() + const { + return rep_->table_properties; +} + +size_t BlockBasedTable::ApproximateMemoryUsage() const { + size_t usage = 0; + if (rep_->filter) { + usage += rep_->filter->ApproximateMemoryUsage(); + } + if (rep_->index_reader) { + usage +=
rep_->index_reader->ApproximateMemoryUsage(); + } + if (rep_->uncompression_dict_reader) { + usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage(); + } + return usage; +} + +// Load the meta-index block from the file. On success, return the loaded +// metaindex block and its iterator. +Status BlockBasedTable::ReadMetaIndexBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* metaindex_block, + std::unique_ptr<InternalIterator>* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr<Block> metaindex; + Status s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep_->table_options), false /* for_compaction */, + rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Encountered error while reading data from metaindex" + " block %s", + s.ToString().c_str()); + return s; + } + + *metaindex_block = std::move(metaindex); + // meta block uses bytewise comparator. + iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); + return Status::OK(); +} + +template <typename TBlocklike> +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry<TBlocklike>* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + assert(block); + assert(block->IsEmpty()); + + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)), + block_cache, cache_handle); + return s; + } + } + + // If not found, search from the compressed block cache.
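+ // (The promotion into the uncompressed cache below happens only when + // read_options.fill_cache is set and the uncompressed block owns its + // bytes; otherwise the caller just gets an owned copy.)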
+ assert(block->IsEmpty()); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + + // if we find it in the compressed cache, then uncompress it and insert it + // into the uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast<BlockContents*>( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + + // Insert uncompressed block into block cache + if (s.ok()) { + std::unique_ptr<TBlocklike> block_holder( + BlocklikeTraits<TBlocklike>::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && + read_options.fill_cache) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry<TBlocklike>, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + block->SetOwnedValue(block_holder.release()); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +template <typename TBlocklike> +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ?
Cache::Priority::HIGH + : Cache::Priority::LOW; + assert(cached_block); + assert(cached_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.statistics; + + std::unique_ptr<TBlocklike> block_holder; + if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } else { + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } + + // Insert the compressed block into the compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because it could point to + // an object on the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry<BlockContents>); + if (s.ok()) { + // The cache now owns block_cont_for_comp_cache; only the failure path + // below may delete it.
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + delete block_cont_for_comp_cache; + } + } + + // Insert into the uncompressed block cache + if (block_cache != nullptr && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry<TBlocklike>, &cache_handle, + priority); + if (s.ok()) { + assert(cache_handle != nullptr); + cached_block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + cached_block->SetOwnedValue(block_holder.release()); + } + + return s; +} + +std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { + auto& rep = rep_; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr<FilterBlockReader>(); + } + + assert(rep->filter_policy); + + switch (filter_type) { + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kBlockFilter: + return BlockBasedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); + + default: + // filter_type is either kNoFilter (exited the function at the first if), + // or it must be covered in this switch block + assert(false); + return std::unique_ptr<FilterBlockReader>(); + } +} + +// disable_prefix_seek should be set to true when the prefix_extractor found in +// the SST differs from the one in mutable_cf_options and the index type is +// HashBasedIndex +InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context, + lookup_context); +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, create and return a new iterator. +// If input_iter is not null, update it and return it. +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? 
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry<UncompressionDict> uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry<Block> block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. 
Use a longer prefix (41 bytes) to differentiate + // from the SST cache key (31 bytes), and use a non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>( + const Rep* rep, Block* block, DataBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewDataIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>( + const Rep* rep, Block* block, IndexBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, /* total_order_seek */ true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full, block_contents_pinned); +} + +// Convert an uncompressed data block (i.e., CachableEntry<Block>) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, create and return a new iterator. +// If input_iter is not null, update it and return it. +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template <typename TBlocklike> +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { + assert(block_entry != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep_->table_options.block_cache.get(); + // No point to cache compressed blocks if it never goes away + Cache* block_cache_compressed = + rep_->immortal_table ? nullptr + : rep_->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. 
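+  // Block cache keys are built from a per-file prefix followed by the block's
+  // offset in the file (see the GetCacheKey() calls below); conceptually:
+  //
+  //   [file-unique prefix][varint64(handle.offset())]
+  //
+  // so blocks of the same file share the prefix and differ only in offset.
+  // The compressed and uncompressed caches use different file prefixes.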
+ Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.statistics; + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; + if (!contents) { + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); + } + + if (s.ok()) { + SequenceNumber seq_no = rep_->get_global_seqno(block_type); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, ckey, block_cache, block_cache_compressed, block_entry, + contents, raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), block_type, get_context); + } + } + } + + // Fill lookup_context. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. 
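+      // For illustration: with the default block_restart_interval of 16 and a
+      // block holding 8 restart points, the estimate below is 16 * 8 = 128
+      // keys. The last restart group may be shorter, so this is only an
+      // approximation.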
+ nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue()); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copies of block_key and cf_name when constructing the + // access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + lookup_context->referenced_key); + } + } + + assert(s.ok() || block_entry->GetValue() == nullptr); + return s; +} + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is nullptr, +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If it's +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to the heap. In any case, the CachableEntry<Block> returned will +// own the data bytes. +// If compression is enabled and there is no compressed block cache, adjacent +// blocks are read out in one I/O (combined read) +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles.
Some of them may be null handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles, + autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses, + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const { + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = + RetrieveBlock(nullptr, options, handle, uncompression_dict, + &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + } + return; + } + + autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + + uint64_t prev_offset = 0; + size_t prev_len = 0; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + size_t prev_end = static_cast<size_t>(prev_offset) + prev_len; + + // If the current block is adjacent to the previous one and, at the same + // time, compression is enabled and there is no compressed cache, we + // combine the two block reads into one.
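+    // For illustration (hypothetical sizes): if block A has
+    // block_size(A) == 4096 (handle size plus the 5-byte trailer) at offset 0
+    // and block B starts at offset 4096, the two are adjacent, so with a
+    // scratch buffer they are folded into a single
+    // FSReadRequest{offset = 0, len = 8192}; B's bytes are later located
+    // inside that request at req_offset 4096.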
+ if (scratch != nullptr && prev_end == handle.offset()) { + req_offset_for_block.emplace_back(prev_len); + prev_len += block_size(handle); + } else { + // No compression, or the current block is not adjacent to the previous + // one: + // Step 1, create a new request for the previous blocks + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.status = IOStatus::OK(); + read_reqs.emplace_back(req); + } + + // Step 2, remember the previous block info + prev_offset = handle.offset(); + prev_len = block_size(handle); + req_offset_for_block.emplace_back(0); + } + req_idx_for_block.emplace_back(read_reqs.size()); + } + // Handle the last block and process the pending last request + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + } + req.status = IOStatus::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + idx_in_batch = 0; + size_t valid_batch_idx = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + assert(valid_batch_idx < req_idx_for_block.size()); + assert(valid_batch_idx < req_offset_for_block.size()); + assert(req_idx_for_block[valid_batch_idx] < read_reqs.size()); + size_t& req_idx = req_idx_for_block[valid_batch_idx]; + size_t& req_offset = req_offset_for_block[valid_batch_idx]; + valid_batch_idx++; + FSReadRequest& req = read_reqs[req_idx]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != req.len) { + s = Status::Corruption( + "truncated block read from " + rep_->file->file_name() + + " offset " + ToString(handle.offset()) + ", expected " + + ToString(req.len) + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + size_t cur_read_end = req_offset + block_size(handle); + if (cur_read_end > req.result.size()) { + s = Status::Corruption( + "truncated block read from " + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + ToString(req.len) + + " bytes, got " + ToString(req.result.size())); + } + + bool blocks_share_read_buffer = (req.result.size() != block_size(handle)); + if (s.ok()) { + if (scratch == nullptr && !blocks_share_read_buffer) { + // We allocated a buffer for this block. Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr<char[]> raw_block(req.scratch + req_offset); + raw_block_contents = BlockContents(std::move(raw_block), handle.size()); + } else { + // We used the scratch buffer, which is shared by the blocks. + // raw_block_contents does not have the ownership. + raw_block_contents = + BlockContents(Slice(req.scratch + req_offset, handle.size())); + } + +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = + DecodeFixed32(data + req_offset + handle.size() + 1); + // Since the scratch might be shared, the offset of the data block in + // the buffer might not be 0.
req.result.data() only points to the + // beginning address of each read request, so we need to add the offset + // within the read request. The checksum is stored in the block trailer, + // at offset handle.size() + 1 (after the one-byte compression-type + // marker). + s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(), + req.result.data() + req_offset, + handle.size() + 1, expected); + TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); + } + } + + if (s.ok()) { + // This handles a rare case: compression is enabled and there is no + // compressed cache (so combined read is enabled). In this case, the + // scratch != nullptr. At the same time, some blocks are actually not + // compressed, since their compression space saving is smaller than the + // threshold. In this case, if the block shares the scratch memory, we + // need to copy it to the heap so that it can be added to the regular + // block cache. + CompressionType compression_type = + raw_block_contents.get_compression_type(); + if (scratch != nullptr && compression_type == kNoCompression) { + Slice raw = Slice(req.scratch + req_offset, block_size(handle)); + raw_block_contents = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), + handle.size()); +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + } + } + + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry<Block>* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, block_entry, + BlockType::kData, mget_iter->get_context, + &lookup_data_block_context, &raw_block_contents); + + // block_entry value could be null if no block cache is present, i.e., + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. In that case, fall + // through and set up the block explicitly + if (block_entry->GetValue() != nullptr) { + continue; + } + } + + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data() + req_offset, + handle.size(), &contents, footer.version(), + rep_->ioptions, memory_allocator); + } else { + // There are two cases here: 1) the caller uses the scratch buffer; + // 2) we use the request buffer. If the scratch buffer is used, we + // ensure that all raw blocks are copied to the heap as single blocks. + // If the scratch buffer is not used, we also have no combined read, + // so the raw block can be used directly.
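+        // Either way, raw_block_contents owns its bytes by this point (the
+        // uncompressed, scratch-backed case was copied to the heap above),
+        // so moving it into `contents` transfers real ownership.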
+ contents = std::move(raw_block_contents); + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue( + new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics)); + } + } + (*statuses)[idx_in_batch] = s; + } +} + +template <typename TBlocklike> +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const { + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, + uncompression_dict, block_entry, + block_type, get_context, lookup_context, + /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr<TBlocklike> block; + + { + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0, + GetMemoryAllocator(rep_->table_options), for_compaction, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get()); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + +// Explicitly instantiate templates for all "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file.
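+// A minimal standalone analogue of this pattern (illustrative only, not
+// RocksDB code):
+//
+//   // count.h -- declaration only
+//   template <typename T> int CountSetBits(T v);
+//
+//   // count.cc -- definition plus explicit instantiations
+//   template <typename T> int CountSetBits(T v) {
+//     int n = 0;
+//     while (v) { v &= v - 1; ++n; }  // clear the lowest set bit
+//     return n;
+//   }
+//   template int CountSetBits<uint32_t>(uint32_t);
+//   template int CountSetBits<uint64_t>(uint64_t);
+//
+// Without the explicit instantiations at the bottom of count.cc, callers in
+// other translation units would compile but fail to link; the same reasoning
+// applies to the four instantiations below.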
+template Status BlockBasedTable::RetrieveBlock<BlockContents>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<BlockContents>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<Block>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<UncompressionDict>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<UncompressionDict>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase<IndexValue>* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); + } + // Create an empty iterator + return new IndexBlockIter(); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. 
+bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto user_key = ExtractUserKey(internal_key); + if (!prefix_extractor->InDomain(user_key)) { + return true; + } + + bool may_match = true; + Status s; + + // First, try checking with the full filter + FilterBlockReader* const filter = rep_->filter.get(); + bool filter_checked = true; + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check, lookup_context); + } else { + // if the prefix_extractor changed for a block-based filter, skip the + // filter + if (need_upper_bound_check) { + return true; + } + auto prefix = prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + + // To prevent any I/O operation in this method, we set `read_tier` to + // make sure we only read the index or filter when they have already + // been loaded into memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + + // Then, try to find it within each block + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator( + no_io_read_options, + /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, lookup_context)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key()) + : iiter->key()) + .starts_with(ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one that could potentially contain the prefix.
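+        // For illustration (hypothetical keys): with index keys
+        // ["apple", "banana", "cherry"] and prefix "bb", Seek("bb") lands on
+        // "cherry". "cherry" does not start with "bb", so only the block
+        // whose index entry is "cherry" could still hold "bb"-prefixed keys,
+        // and just that one block's filter is probed below.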
+ BlockHandle handle = iiter->value().handle; + may_match = filter->PrefixMayMatch( + prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); + } + } + } + + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + } + + return may_match; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) { + SeekImpl(&target); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() { + SeekImpl(nullptr); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl( + const Slice* target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? 
(user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev( + const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now, prefix seek is totally disabled in auto prefix mode because we + // don't have the logic for it yet + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely contain the position for `target`, the + // same as for Seek(), rather than the block before it. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is different is when they seek to a + // position on the boundary. For example, if they SeekForPrev(5), we should + // go to the first block, rather than the second. However, we don't have the + // information to distinguish the two unless we read the second block. In + // this case, we'll end up reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +template <class TBlockIter, typename TValue> +bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult( + IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + 
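+// Note on the deferred-block read used above: when the index block stores
+// each data block's first key, a seek whose target is at or before that
+// first key can stop at the index entry (is_at_first_key_from_index_) and
+// serve key() from the index alone. The data block is only fetched by
+// MaterializeCurrentBlock() once the caller actually steps into it, which
+// can save an I/O for seek-then-stop access patterns.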
+template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Prefetch additional data for range scans (iterators). Enabled only for + // user reads. + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + if (lookup_context_.caller != TableReaderCaller::kCompaction) { + if (read_options_.readahead_size == 0) { + // Implicit auto readahead + num_file_reads_++; + if (num_file_reads_ > + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast<size_t>(block_size(data_block_handle)) > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = static_cast<size_t>(data_block_handle.offset() + + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxAutoReadaheadSize. + readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, + readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. + rep->CreateFilePrefetchBuffer( + BlockBasedTable::kInitAutoReadaheadSize, + BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + } + } + } else if (!prefetch_buffer_) { + // Explicit user requested readahead + // The actual condition is: + // if (read_options_.readahead_size != 0 && !prefetch_buffer_) + rep->CreateFilePrefetchBuffer(read_options_.readahead_size, + read_options_.readahead_size, + &prefetch_buffer_); + } + } else if (!prefetch_buffer_) { + rep->CreateFilePrefetchBuffer(compaction_readahead_size_, + compaction_readahead_size_, + &prefetch_buffer_); + } + + Status s; + table_->NewDataBlockIterator<TBlockIter>( + read_options_, data_block_handle, &block_iter_, block_type_, + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +template <class TBlockIter, typename TValue> +bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. 
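+    // The index entry claimed this block begins with first_internal_key,
+    // but the block's actual first key differs, so either the index or the
+    // block is corrupt. Report corruption rather than returning wrong data.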
+ block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() { + // TODO: the while loop is inherited from the two-level iterator. We don't + // know whether a block can be empty; if it cannot, the loop could be + // replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than the smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have checked the lower bound here too, but we opt not to, for + // code simplicity.
+} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, + TValue>::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + read_options.auto_prefix_mode || + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + if (arena == nullptr) { + return new BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>)); + return new (mem) BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + if (rep_->whole_key_filtering) { + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, get_context, lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + 
rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +void BlockBasedTable::FullFilterKeysMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + MultiGetRange* range, const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return; + } + if (rep_->whole_key_filtering) { + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0) { + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); + } +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be an internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter. + // If the full filter is not useful, then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); + if (!may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if the prefix_extractor found in the SST differs from the options, + // disable BlockPrefixIndex. Only do this check when index_type is + // kHashSearch.
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + bool matched = false; // if such a user key matched a key in the SST + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), + prefix_extractor, v.handle.offset(), no_io, + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); + + if (not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + break; + } + + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between the highest key in the previous + // block and the lowest key in the current block. + break; + } + + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator<DataBlockIter>( + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get the block from the block cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key in this block, and the iter is not at + // the end of the block, i.e. the key cannot be in the following + // blocks either. In this case, the seek_key cannot be found, so we + // break from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } + done = true; + break; + } + } + s = biter.status(); + } + // Write the block cache access record.
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + } + + return s; +} + +using MultiGetRange = MultiGetContext::Range; +void BlockBasedTable::MultiGet(const ReadOptions& read_options, + const MultiGetRange* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters) { + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + MultiGetRange sst_file_range(*mget_range, mget_range->begin(), + mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); + + if (skip_filters || !sst_file_range.empty()) { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
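
The ticker bumps in Get() above can be observed through the statistics object. A small sketch, assuming statistics were installed at open time via options.statistics = rocksdb::CreateDBStatistics():

#include <cstdint>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Read back the bloom filter counters recorded by RecordTick() above.
void LogBloomEffectiveness(const rocksdb::Options& options) {
  if (!options.statistics) {
    return;
  }
  const uint64_t useful =
      options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_USEFUL);
  const uint64_t full_pos =
      options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_FULL_POSITIVE);
  const uint64_t full_true_pos = options.statistics->getTickerCount(
      rocksdb::BLOOM_FILTER_FULL_TRUE_POSITIVE);
  // full_pos - full_true_pos approximates full-filter false positives.
  (void)useful;
  (void)(full_pos - full_true_pos);
}
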
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + sst_file_range.begin()->get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + uint64_t offset = std::numeric_limits<uint64_t>::max(); + autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles; + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results; + autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr<char[]> block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + + CachableEntry<UncompressionDict> uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + offset = v.handle.offset(); + BlockHandle handle = v.handle; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock( + nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + miter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + if (s.IsIncomplete()) { + s = Status::OK(); + } + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. 
Add NULL handle to indicate there is
+ // nothing to read from disk
+ block_handles.emplace_back(BlockHandle::NullBlockHandle());
+ } else {
+ block_handles.emplace_back(handle);
+ total_len += block_size(handle);
+ }
+ }
+
+ if (total_len) {
+ char* scratch = nullptr;
+ // If the blocks need to be uncompressed and we don't need the
+ // compressed blocks, then we can use a contiguous block of
+ // memory to read in all the blocks as it will be temporary
+ // storage
+ // 1. If blocks are compressed and compressed block cache is there,
+ // alloc heap bufs
+ // 2. If blocks are uncompressed, alloc heap bufs
+ // 3. If blocks are compressed and no compressed block cache, use
+ // stack buf
+ if (rep_->table_options.block_cache_compressed == nullptr &&
+ rep_->blocks_maybe_compressed) {
+ if (total_len <= kMultiGetReadStackBufSize) {
+ scratch = stack_buf;
+ } else {
+ scratch = new char[total_len];
+ block_buf.reset(scratch);
+ }
+ }
+ RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
+ &statuses, &results, scratch, dict);
+ }
+ }
+
+ DataBlockIter first_biter;
+ DataBlockIter next_biter;
+ size_t idx_in_batch = 0;
+ for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
+ ++miter) {
+ Status s;
+ GetContext* get_context = miter->get_context;
+ const Slice& key = miter->ikey;
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ bool first_block = true;
+ do {
+ DataBlockIter* biter = nullptr;
+ bool reusing_block = true;
+ uint64_t referenced_data_size = 0;
+ bool does_referenced_key_exist = false;
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet, tracing_mget_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr);
+ if (first_block) {
+ if (!block_handles[idx_in_batch].IsNull() ||
+ !results[idx_in_batch].IsEmpty()) {
+ first_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, results[idx_in_batch], &first_biter,
+ statuses[idx_in_batch]);
+ reusing_block = false;
+ }
+ biter = &first_biter;
+ idx_in_batch++;
+ } else {
+ IndexValue v = iiter->value();
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ break;
+ }
+
+ next_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, iiter->value().handle, &next_biter,
+ BlockType::kData, get_context, &lookup_data_block_context,
+ Status(), nullptr);
+ biter = &next_biter;
+ reusing_block = false;
+ }
+
+ if (read_options.read_tier == kBlockCacheTier &&
+ biter->status().IsIncomplete()) {
+ // couldn't get block from block_cache
+ // Update Saver.state to Found because we are only looking for
+ // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter->status().ok()) {
+ s = biter->status();
+ break;
+ }
+
+ bool may_exist = biter->SeekForGet(key);
+ if (!may_exist) {
+ // HashSeek cannot find the key in this block and the iter is not at
+ // the end of the block, i.e. the key cannot be in the following blocks
+ // either. In this case, the seek_key cannot be found, so we break
+ // from the top level for-loop.
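
This batching is the point of MultiGet: keys that resolve to the same data block share one block fetch. A minimal sketch of the public entry point, assuming an open `db`:

#include <string>
#include <vector>
#include "rocksdb/db.h"

// One status per key; values are filled in positionally.
std::vector<rocksdb::Status> BatchedLookup(
    rocksdb::DB* db, const std::vector<rocksdb::Slice>& keys,
    std::vector<std::string>* values) {
  return db->MultiGet(rocksdb::ReadOptions(), keys, values);
}
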
+ break; + } + + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; + } + } + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } + done = true; + break; + } + s = biter->status(); + } + // Write the block cache access. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + *(miter->s) = s; + } + } +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be 
pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + + NewDataBlockIterator<DataBlockIter>( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase<IndexValue>* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase<IndexValue>* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : kMaxAutoReadaheadSize; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. 
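
A hedged sketch of driving this verification from the public API; it assumes a RocksDB release in which DB::VerifyChecksum() accepts ReadOptions, as the table-level method here does:

#include "rocksdb/db.h"

// Verify every table file; a nonzero readahead_size overrides the
// kMaxAutoReadaheadSize default chosen in VerifyChecksumInBlocks() above.
rocksdb::Status VerifyAllTables(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.readahead_size = 2 * 1024 * 1024;  // 2 MiB sequential readahead
  return db->VerifyChecksum(ro);
}
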
+ FilePrefetchBuffer prefetch_buffer(
+ rep_->file.get(), readahead_size /* readahead_size */,
+ readahead_size /* max_readahead_size */,
+ !rep_->ioptions.allow_mmap_reads /* enable */);
+
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle = index_iter->value().handle;
+ BlockContents contents;
+ BlockFetcher block_fetcher(
+ rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
+ &contents, rep_->ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kData,
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
+ const Slice& meta_block_name) {
+ if (meta_block_name.starts_with(kFilterBlockPrefix) ||
+ meta_block_name.starts_with(kFullFilterBlockPrefix) ||
+ meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
+ return BlockType::kFilter;
+ }
+
+ if (meta_block_name == kPropertiesBlock) {
+ return BlockType::kProperties;
+ }
+
+ if (meta_block_name == kCompressionDictBlock) {
+ return BlockType::kCompressionDictionary;
+ }
+
+ if (meta_block_name == kRangeDelBlock) {
+ return BlockType::kRangeDeletion;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesBlock) {
+ return BlockType::kHashIndexPrefixes;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
+ return BlockType::kHashIndexMetadata;
+ }
+
+ assert(false);
+ return BlockType::kInvalid;
+}
+
+Status BlockBasedTable::VerifyChecksumInMetaBlocks(
+ InternalIteratorBase<Slice>* index_iter) {
+ Status s;
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle;
+ Slice input = index_iter->value();
+ s = handle.DecodeFrom(&input);
+ BlockContents contents;
+ const Slice meta_block_name = index_iter->key();
+ BlockFetcher block_fetcher(
+ rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
+ ReadOptions(), handle, &contents, rep_->ioptions,
+ false /* decompress */, false /*maybe_compressed*/,
+ GetBlockTypeForMetaBlockByName(meta_block_name),
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
+ TableProperties* table_properties;
+ s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
+ index_iter->value(),
+ &table_properties);
+ delete table_properties;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
+ assert(rep_ != nullptr);
+
+ Cache* const cache = rep_->table_options.block_cache.get();
+ if (cache == nullptr) {
+ return false;
+ }
+
+ char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ Slice cache_key =
+ GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
+ cache_key_storage);
+
+ Cache::Handle* const cache_handle = cache->Lookup(cache_key);
+ if (cache_handle == nullptr) {
+ return false;
+ }
+
+ cache->Release(cache_handle);
+
+ return true;
+}
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+ const Slice& key) {
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
+ options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+
iiter->Seek(key); + assert(iiter->Valid()); + + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. + // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true + + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + FALLTHROUGH_INTENDED; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + std::unique_ptr<Block> metaindex_guard; + std::unique_ptr<InternalIterator> metaindex_iter_guard; + auto meta_index_iter = preloaded_meta_index_iter; + bool should_fallback = false; + if (rep_->internal_prefix_transform.get() == nullptr) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "No prefix extractor passed in. Fall back to binary" + " search index."); + should_fallback = true; + } else if (meta_index_iter == nullptr) { + auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, + &metaindex_iter_guard); + if (!s.ok()) { + // we simply fall back to binary search in case there is any + // problem with prefix hash index loading. + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Unable to read the metaindex block." + " Fall back to binary search index."); + should_fallback = true; + } + meta_index_iter = metaindex_iter_guard.get(); + } + + if (should_fallback) { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } else { + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + } + default: { + std::string error_message = + "Unrecognized index type: " + ToString(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf( + const InternalIteratorBase<IndexValue>& index_iter) const { + uint64_t result = 0; + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + result = handle.offset(); + } else { + // The iterator is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. 
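
For reference, an illustrative sketch of the options that select the branches of the CreateIndexReader() switch above; the partitioned (two-level) case is usually paired with partitioned filters so both structures page in and out of block cache in pieces:

#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

// Route CreateIndexReader() to PartitionIndexReader.
rocksdb::BlockBasedTableOptions MakePartitionedOptions() {
  rocksdb::BlockBasedTableOptions to;
  to.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  to.partition_filters = true;
  to.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  to.metadata_block_size = 4096;  // target size of each index partition
  return to;
}
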
+ if (result == 0) { + result = rep_->footer.metaindex_handle().offset(); + } + } + + return result; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + return ApproximateOffsetOf(*index_iter); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset = ApproximateOffsetOf(*index_iter); + index_iter->Seek(end); + uint64_t end_offset = ApproximateOffsetOf(*index_iter); + + assert(end_offset >= start_offset); + return end_offset - start_offset; +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector<KVPairBlock>* kv_pair_blocks) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status 
BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == + ROCKSDB_NAMESPACE::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) { + out_file->Append(" Range deletion block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry<UncompressionDict> uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_file->Append( + "Compression Dictionary:\n" + "--------------------------------------\n"); + out_file->Append(" size (bytes): "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size())); + out_file->Append("\n\n"); + out_file->Append(" HEX "); + out_file->Append(raw_dict.ToString(true).c_str()); + out_file->Append("\n\n"); + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_file->Append( + "Range deletions:\n" + "--------------------------------------\n" + " "); + for (; range_del_iter->Valid(); 
range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + } + out_file->Append("\n"); + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + "--------------------------------------\n"); + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_file->Append(" HEX "); + out_file->Append(user_key.ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); + out_file->Append("\n"); + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_file->Append("Data Block # "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error 
reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + } + out_file->Append("\n"); + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg = + static_cast<double>(datablock_size_sum) / num_datablocks; + out_file->Append("Data Block Summary:\n"); + out_file->Append("--------------------------------------"); + out_file->Append("\n # data blocks: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks)); + out_file->Append("\n min data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min)); + out_file->Append("\n max data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max)); + out_file->Append("\n avg data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg)); + out_file->Append("\n"); + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(value.ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.h b/src/rocksdb/table/block_based/block_based_table_reader.h new file mode 100644 index 000000000..28a378988 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.h @@ -0,0 +1,824 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/uncompression_dict_reader.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "table/persistent_cache_helper.h"
+#include "table/table_properties_internal.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class FilterBlockReader;
+class BlockBasedFilterBlockReader;
+class FullFilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class FSRandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct ReadOptions;
+class GetContext;
+
+typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
+
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. As for record retrieval, we'll
+// first locate the block where the target record may reside, then read the
+// block into memory, and finally search for the record within the block. To
+// avoid frequent reads of the same block, we introduced the block cache to
+// keep loaded blocks in memory.
+class BlockBasedTable : public TableReader {
+ public:
+ static const std::string kFilterBlockPrefix;
+ static const std::string kFullFilterBlockPrefix;
+ static const std::string kPartitionedFilterBlockPrefix;
+ // The longest prefix of the cache key used to identify blocks.
+ // For Posix files the unique ID is three varints.
+ static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
+
+ // All the below fields control iterator readahead
+ static const size_t kInitAutoReadaheadSize = 8 * 1024;
+ // Found that 256 KB readahead size provides the best performance, based on
+ // experiments, for auto readahead. Experiment data is in PR #3282.
+ static const size_t kMaxAutoReadaheadSize;
+ static const int kMinNumFileReadsToStartAutoReadahead = 2;
+
+ // Attempt to open the table that is stored in bytes [0..file_size)
+ // of "file", and read the metadata entries necessary to allow
+ // retrieving data from the table.
+ //
+ // If successful, returns ok and sets "*table_reader" to the newly opened
+ // table. The client should delete "*table_reader" when no longer needed.
+ // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // @param file must remain live while this Table is in use. + // @param prefetch_index_and_filter_in_cache can be used to disable + // prefetching of + // index and filter blocks into block cache at startup + // @param skip_filters Disables loading/accessing the filter block. Overrides + // prefetch_index_and_filter_in_cache, so filter will be skipped if both + // are set. + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, + bool skip_filters = false, int level = -1, + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr); + + bool PrefixMayMatch(const Slice& internal_key, + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // @param skip_filters Disables loading/accessing the filter block + // compaction_readahead_size: its value will only be used if caller = + // kCompaction. + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; + + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& read_options) override; + + // @param skip_filters Disables loading/accessing the filter block + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + void MultiGet(const ReadOptions& readOptions, + const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + // Pre-fetch the disk blocks that correspond to the key range specified by + // (kbegin, kend). The call will return error status in the event of + // IO or iteration error. + Status Prefetch(const Slice* begin, const Slice* end) override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. 
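
A minimal sketch of the public wrapper that ultimately calls the per-table ApproximateSize() declared below; names are illustrative:

#include <cstdint>
#include "rocksdb/db.h"

// The result is in file bytes, i.e. after compression.
uint64_t ApproxRangeBytes(rocksdb::DB* db, const rocksdb::Slice& start,
                          const rocksdb::Slice& limit) {
  rocksdb::Range r(start, limit);
  uint64_t size = 0;
  db->GetApproximateSizes(&r, 1, &size);
  return size;
}
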
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) override;
+
+ bool TEST_BlockInCache(const BlockHandle& handle) const;
+
+ // Returns true if the block for the specified key is in cache.
+ // REQUIRES: key is in this table && block cache enabled
+ bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+ // Set up the table for Compaction. Might change some parameters with
+ // posix_fadvise
+ void SetupForCompaction() override;
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+ size_t ApproximateMemoryUsage() const override;
+
+ // convert SST file to a human readable form
+ Status DumpTable(WritableFile* out_file) override;
+
+ Status VerifyChecksum(const ReadOptions& readOptions,
+ TableReaderCaller caller) override;
+
+ ~BlockBasedTable();
+
+ bool TEST_FilterBlockInCache() const;
+ bool TEST_IndexBlockInCache() const;
+
+ // IndexReader is the interface that provides the functionality for index
+ // access.
+ class IndexReader {
+ public:
+ virtual ~IndexReader() = default;
+
+ // Create an iterator for index access. If iter is null, then a new object
+ // is created on the heap, and the callee will have the ownership.
+ // If a non-null iter is passed in, it will be used, and the returned value
+ // is either the same as iter or a new on-heap object that
+ // wraps the passed iter. In the latter case the return value points
+ // to a different object than iter, and the callee has the ownership of the
+ // returned object.
+ virtual InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) = 0;
+
+ // Report an approximation of how much memory has been used other than
+ // memory that was allocated in block cache.
+ virtual size_t ApproximateMemoryUsage() const = 0;
+ // Cache the dependencies of the index reader (e.g. the partitions
+ // of a partitioned index).
+ virtual void CacheDependencies(bool /* pin */) {}
+ };
+
+ class IndexReaderCommon;
+
+ static Slice GetCacheKey(const char* cache_key_prefix,
+ size_t cache_key_prefix_size,
+ const BlockHandle& handle, char* cache_key);
+
+ // Retrieve all key value pairs from data blocks in the table.
+ // The keys retrieved are internal keys.
+ Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); + + struct Rep; + + Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, Status s) const; + + class PartitionedIndexIteratorState; + + template <typename TBlocklike> + friend class FilterBlockReaderCommon; + + friend class PartitionIndexReader; + + friend class UncompressionDictReader; + + protected: + Rep* rep_; + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; + + private: + friend class MockedBlockBasedTable; + static std::atomic<uint64_t> next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + BlockType block_type, + GetContext* get_context) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template <typename TBlockIter> + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); + + // If block cache enabled (compressed or uncompressed), looks for the block + // identified by handle in (1) uncompressed cache, (2) compressed cache, and + // then (3) file. If found, inserts into the cache(s) that were searched + // unsuccessfully (e.g., if found in file, will add to both uncompressed and + // compressed caches if they're enabled). + // + // @param block_entry value is set to the uncompressed block if found. If + // in uncompressed block cache, also sets cache_handle to reference that + // block. + template <typename TBlocklike> + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). 
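
GetKVPairsFromDataBlocks() has a rough public-API analogue in SstFileReader, sketched below; the public iterator yields user keys, whereas the method above keeps the full internal keys:

#include <memory>
#include <string>
#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"

// Walk every data block of one SST file in key order.
rocksdb::Status ScanSstFile(const std::string& path) {
  rocksdb::Options options;
  rocksdb::SstFileReader reader(options);
  rocksdb::Status s = reader.Open(path);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<rocksdb::Iterator> it(
      reader.NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // it->key() / it->value() come straight out of the data blocks.
  }
  return it->status();
}
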
+ template <typename TBlocklike>
+ Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& ro, const BlockHandle& handle,
+ const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry,
+ BlockType block_type, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+ void RetrieveMultipleBlocks(
+ const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
+ results,
+ char* scratch, const UncompressionDict& uncompression_dict) const;
+
+ // Get the iterator from the index reader.
+ //
+ // If input_iter is not set, return a new Iterator.
+ // If input_iter is set, try to update it and return it as Iterator.
+ // However note that in some cases the returned iterator may be different
+ // from input_iter. In such case the returned iterator should be freed.
+ //
+ // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+ // following conditions are met:
+ // 1. We enabled table_options.cache_index_and_filter_blocks.
+ // 2. index is not present in block cache.
+ // 3. We disallowed any io to be performed, that is, read_options ==
+ // kBlockCacheTier
+ InternalIteratorBase<IndexValue>* NewIndexIterator(
+ const ReadOptions& read_options, bool need_upper_bound_check,
+ IndexBlockIter* input_iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ // Read a block from the block caches (if set): block_cache and
+ // block_cache_compressed.
+ // On success, Status::OK will be returned and @block will be populated with
+ // a pointer to the block as well as its block handle.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status GetDataBlockFromCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
+ const UncompressionDict& uncompression_dict, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Put a raw block (maybe compressed) to the corresponding block caches.
+ // This method will perform decompression against raw_block if needed and then
+ // populate the block caches.
+ // On success, Status::OK will be returned; also @block will be populated with
+ // uncompressed block and its cache handle.
+ //
+ // Allocated memory managed by raw_block_contents will be transferred to
+ // PutDataBlockToCache(). After the call, the object will be invalid.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status PutDataBlockToCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ CachableEntry<TBlocklike>* cached_block,
+ BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
+ const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
+ MemoryAllocator* memory_allocator, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+ // after a call to Seek(key), until handle_result returns false.
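
An illustrative sketch of the configuration the Status::Incomplete note above refers to; with these options, index and filter blocks compete for block cache space, so a cold read of them may require I/O:

#include "rocksdb/cache.h"
#include "rocksdb/table.h"

// Put index/filter blocks in the same LRU cache as data blocks.
rocksdb::BlockBasedTableOptions MakeCachedOptions() {
  rocksdb::BlockBasedTableOptions to;
  to.block_cache = rocksdb::NewLRUCache(512 << 20);  // 512 MiB shared cache
  to.cache_index_and_filter_blocks = true;
  to.pin_l0_filter_and_index_blocks_in_cache = true;  // keep hot L0 metadata
  return to;
}
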
+ // May not make such a call if filter policy says that key is not present.
+ friend class TableCache;
+ friend class BlockBasedTableBuilder;
+
+ // Create an index reader based on the index type stored in the table.
+ // Optionally, the user can pass a preloaded meta_index_iter for index types
+ // that need to access extra meta blocks during construction. This parameter
+ // helps avoid re-reading the meta index block if the caller has already
+ // created one.
+ Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* preloaded_meta_index_iter,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader);
+
+ bool FullFilterKeyMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, const Slice& user_key,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ void FullFilterKeysMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, MultiGetRange* range,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const;
+
+ static Status PrefetchTail(
+ RandomAccessFileReader* file, uint64_t file_size,
+ TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
+ const bool preload_all,
+ std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
+ Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
+ std::unique_ptr<Block>* metaindex_block,
+ std::unique_ptr<InternalIterator>* iter);
+ Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
+ const Slice& handle_value,
+ TableProperties** table_properties);
+ Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const SequenceNumber largest_seqno);
+ Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const InternalKeyComparator& internal_comparator,
+ BlockCacheLookupContext* lookup_context);
+ Status PrefetchIndexAndFilterBlocks(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ BlockBasedTable* new_table, bool prefetch_all,
+ const BlockBasedTableOptions& table_options, const int level,
+ BlockCacheLookupContext* lookup_context);
+
+ static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
+
+ Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
+ Status VerifyChecksumInBlocks(const ReadOptions& read_options,
+ InternalIteratorBase<IndexValue>* index_iter);
+
+ // Create the filter from the filter block.
+ std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+ bool pin, BlockCacheLookupContext* lookup_context);
+
+ static void SetupCacheKeyPrefix(Rep* rep);
+
+ // Generate a cache key prefix from the file
+ static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
+ char* buffer, size_t* size);
+ static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
+ size_t* size);
+
+ // Given an index iterator, return its offset in the file.
+ uint64_t ApproximateOffsetOf(
+ const InternalIteratorBase<IndexValue>& index_iter) const;
+
+ // Helper functions for DumpTable()
+ Status DumpIndexBlock(WritableFile* out_file);
+ Status DumpDataBlocks(WritableFile* out_file);
+ void DumpKeyValue(const Slice& key, const Slice& value,
+ WritableFile* out_file);
+
+ // A cumulative data block read in MultiGet smaller than this size will
+ // use a stack buffer
+ static constexpr size_t kMultiGetReadStackBufSize = 8192;
+
+ friend class PartitionedFilterBlockReader;
+ friend class PartitionedFilterBlockTest;
+ friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+};
+
+// Maintaining state of a two-level iteration on a partitioned index structure.
+class BlockBasedTable::PartitionedIndexIteratorState
+ : public TwoLevelIteratorState {
+ public:
+ PartitionedIndexIteratorState(
+ const BlockBasedTable* table,
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
+ InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+ const BlockHandle& index_value) override;
+
+ private:
+ // Don't own table_
+ const BlockBasedTable* table_;
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
+};
+
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
+struct BlockBasedTable::Rep {
+ Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
+ const BlockBasedTableOptions& _table_opt,
+ const InternalKeyComparator& _internal_comparator, bool skip_filters,
+ int _level, const bool _immortal_table)
+ : ioptions(_ioptions),
+ env_options(_env_options),
+ table_options(_table_opt),
+ filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
+ internal_comparator(_internal_comparator),
+ filter_type(FilterType::kNoFilter),
+ index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
+ hash_index_allow_collision(false),
+ whole_key_filtering(_table_opt.whole_key_filtering),
+ prefix_filtering(true),
+ global_seqno(kDisableGlobalSequenceNumber),
+ level(_level),
+ immortal_table(_immortal_table) {}
+
+ const ImmutableCFOptions& ioptions;
+ const EnvOptions& env_options;
+ const BlockBasedTableOptions table_options;
+ const FilterPolicy* const filter_policy;
+ const InternalKeyComparator& internal_comparator;
+ Status status;
+ std::unique_ptr<RandomAccessFileReader> file;
+ char cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t cache_key_prefix_size = 0;
+ char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t persistent_cache_key_prefix_size = 0;
+ char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t compressed_cache_key_prefix_size = 0;
+ PersistentCacheOptions persistent_cache_options;
+
+ // Footer contains the fixed table information
+ Footer footer;
+
+ std::unique_ptr<IndexReader> index_reader;
+ std::unique_ptr<FilterBlockReader> filter;
+ std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+
+ enum class FilterType {
+ kNoFilter,
+ kFullFilter,
+ kBlockFilter,
+ kPartitionedFilter,
+ };
+ FilterType filter_type;
+ BlockHandle filter_handle;
+ BlockHandle compression_dict_handle;
+
+ std::shared_ptr<const TableProperties> table_properties;
+ BlockBasedTableOptions::IndexType index_type;
+ bool hash_index_allow_collision;
+ bool whole_key_filtering;
+ bool prefix_filtering;
+ // TODO(kailiu) It is very ugly to use internal key in table, since table
+ // module should not be relying on db module.
However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. + // null if no prefix_extractor is passed in when opening the table reader. + std::unique_ptr<SliceTransform> internal_prefix_transform; + std::shared_ptr<const SliceTransform> table_prefix_extractor; + + std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels; + + // If global_seqno is used, all Keys in this file will have the same + // seqno with value `global_seqno`. + // + // A value of kDisableGlobalSequenceNumber means that this feature is disabled + // and every key have it's own seqno. + SequenceNumber global_seqno; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + bool blocks_definitely_zstd_compressed = false; + + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + + const bool immortal_table; + + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties + ? table_properties->column_family_id + : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context:: + kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } + void CreateFilePrefetchBuffer( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr<FilePrefetchBuffer>* fpb) const { + fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, + max_readahead_size, + !ioptions.allow_mmap_reads /* enable */)); + } +}; + +// Iterates over the contents of BlockBasedTable. 
+template <class TBlockIter, typename TValue = Slice> +class BlockBasedTableIterator : public InternalIteratorBase<TValue> { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + BlockBasedTableIterator(const BlockBasedTable* table, + const ReadOptions& read_options, + const InternalKeyComparator& icomp, + InternalIteratorBase<IndexValue>* index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, + BlockType block_type, TableReaderCaller caller, + size_t compaction_readahead_size = 0) + : table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + index_iter_(index_iter), + pinned_iters_mgr_(nullptr), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), + block_type_(block_type), + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} + + ~BlockBasedTableIterator() { delete index_iter_; } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + TValue value() const override { + assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast<BlockBasedTableIterator*>(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. + return TValue(); + } + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + // Whether iterator invalidated for being out of bound. + bool IsOutOfBound() override { return is_out_of_bound_; } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + // Load current block if not loaded. 
+    if (is_at_first_key_from_index_) {
+      const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
+    }
+    // BlockIter::IsValuePinned() is always true. No need to check
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           block_iter_points_to_real_block_;
+  }
+
+  void ResetDataIter() {
+    if (block_iter_points_to_real_block_) {
+      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
+        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
+      }
+      block_iter_.Invalidate(Status::OK());
+      block_iter_points_to_real_block_ = false;
+    }
+  }
+
+  void SavePrevIndexValue() {
+    if (block_iter_points_to_real_block_) {
+      // Reseek. If they end up with the same data block, we shouldn't re-fetch
+      // the same data block.
+      prev_block_offset_ = index_iter_->value().handle.offset();
+    }
+  }
+
+ private:
+  enum class IterDirection {
+    kForward,
+    kBackward,
+  };
+
+  const BlockBasedTable* table_;
+  const ReadOptions read_options_;
+  const InternalKeyComparator& icomp_;
+  UserComparatorWrapper user_comparator_;
+  InternalIteratorBase<IndexValue>* index_iter_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  TBlockIter block_iter_;
+
+  // True if block_iter_ is initialized and points to the same block
+  // as the index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+  // Whether the current data block is fully within the iterate upper bound.
+  bool data_block_within_upper_bound_ = false;
+  // True if we're standing at the first key of a block, and we haven't loaded
+  // that block yet. A call to value() will trigger loading the block.
+  bool is_at_first_key_from_index_ = false;
+  bool check_filter_;
+  // TODO(Zhongyi): pick a better name
+  bool need_upper_bound_check_;
+  const SliceTransform* prefix_extractor_;
+  BlockType block_type_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+  // Readahead size used in compaction; its value is used only if
+  // lookup_context_.caller = kCompaction.
+  size_t compaction_readahead_size_;
+
+  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
+  size_t readahead_limit_ = 0;
+  int64_t num_file_reads_ = 0;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target);
+
+  void InitDataBlock();
+  bool MaterializeCurrentBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+  void CheckOutOfBound();
+
+  // Check if the data block is fully within iterate_upper_bound.
+  //
+  // Note MyRocks may update iterate bounds between seeks. To work around it,
+  // we need to check and update data_block_within_upper_bound_ accordingly.
+  void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // The upper bound check isn't sufficient for the backward direction to
+      // guarantee the same result as total order, so disable the prefix
+      // check.
+      return true;
+    }
+    if (check_filter_ &&
+        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
+                                need_upper_bound_check_, &lookup_context_)) {
+      // TODO remember that the iterator is invalidated because of a prefix
+      // match. This can prevent the upper-level file iterator from falsely
+      // believing that the position is the end of the SST file and moving to
+      // the first key of the next file.
+ ResetDataIter(); + return false; + } + return true; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_builder.cc b/src/rocksdb/table/block_based/block_builder.cc new file mode 100644 index 000000000..6f77ef97c --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. 
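To make the entry format above concrete, the following standalone sketch (illustrative only, not part of this diff) appends one prefix-compressed entry and the restart trailer; the varint and fixed32 helpers are local stand-ins for what util/coding.h provides, and all function names here are hypothetical.

#include <cstdint>
#include <string>
#include <vector>

// Local stand-in for a LEB128-style varint encoder.
static void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Local stand-in for a little-endian fixed32 encoder.
static void PutFixed32(std::string* dst, uint32_t v) {
  for (int i = 0; i < 4; ++i) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

// Append one entry: <shared><non_shared><value_length> varints, then the
// key delta (the bytes of `key` past the shared prefix), then the value.
static void AppendEntry(std::string* block, const std::string& prev_key,
                        const std::string& key, const std::string& value) {
  size_t shared = 0;
  while (shared < prev_key.size() && shared < key.size() &&
         prev_key[shared] == key[shared]) {
    ++shared;
  }
  PutVarint32(block, static_cast<uint32_t>(shared));
  PutVarint32(block, static_cast<uint32_t>(key.size() - shared));
  PutVarint32(block, static_cast<uint32_t>(value.size()));
  block->append(key, shared, std::string::npos);
  block->append(value);
}

// Append the trailer: each restart offset as a fixed32, then the count.
static void AppendTrailer(std::string* block,
                          const std::vector<uint32_t>& restarts) {
  for (uint32_t offset : restarts) {
    PutFixed32(block, offset);
  }
  PutFixed32(block, static_cast<uint32_t>(restarts.size()));
}

At a restart point, shared would be forced to 0 and the current buffer offset pushed onto the restarts vector, which is what makes binary search over restart points possible.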
+
+#include "table/block_based/block_builder.h"
+
+#include <assert.h>
+#include <algorithm>
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/data_block_footer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlockBuilder::BlockBuilder(
+    int block_restart_interval, bool use_delta_encoding,
+    bool use_value_delta_encoding,
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    double data_block_hash_table_util_ratio)
+    : block_restart_interval_(block_restart_interval),
+      use_delta_encoding_(use_delta_encoding),
+      use_value_delta_encoding_(use_value_delta_encoding),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  switch (index_type) {
+    case BlockBasedTableOptions::kDataBlockBinarySearch:
+      break;
+    case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+      data_block_hash_index_builder_.Initialize(
+          data_block_hash_table_util_ratio);
+      break;
+    default:
+      assert(0);
+  }
+  assert(block_restart_interval_ >= 1);
+  restarts_.push_back(0);  // First restart point is at offset 0
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+}
+
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);  // First restart point is at offset 0
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Reset();
+  }
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
+                                         const Slice& value) const {
+  size_t estimate = CurrentSizeEstimate();
+  // Note: this is an imprecise estimate as it accounts for the whole key size
+  // instead of the non-shared key size.
+  estimate += key.size();
+  // In value delta encoding we estimate the value delta size as half the full
+  // value size, since only the size field of the block handle is encoded.
+  estimate +=
+      !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
+          ? value.size()
+          : value.size() / 2;
+
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t);  // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t);  // varint for shared prefix length.
+  // Note: this is an imprecise estimate as there will be two encoded sizes,
+  // one for the shared key size and one for the non-shared key size.
+  estimate += VarintLength(key.size());  // varint for key length.
+  if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
+    estimate += VarintLength(value.size());  // varint for value length.
+ } + + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + + uint32_t num_restarts = static_cast<uint32_t>(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); + size_t shared = 0; // number of bytes shared with prev key + if (counter_ >= block_restart_interval_) { + // Restart compression + restarts_.push_back(static_cast<uint32_t>(buffer_.size())); + estimate_ += sizeof(uint32_t); + counter_ = 0; + + if (use_delta_encoding_) { + // Update state + last_key_.assign(key.data(), key.size()); + } + } else if (use_delta_encoding_) { + Slice last_key_piece(last_key_); + // See how much sharing to do with previous string + shared = key.difference_offset(last_key_piece); + + // Update state + // We used to just copy the changed data here, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } + + const size_t non_shared = key.size() - shared; + const size_t curr_size = buffer_.size(); + + if (use_value_delta_encoding_) { + // Add "<shared><non_shared>" to buffer_ + PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared)); + } else { + // Add "<shared><non_shared><value_size>" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared), + static_cast<uint32_t>(value.size())); + } + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } + + counter_++; + estimate_ += buffer_.size() - curr_size; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_builder.h b/src/rocksdb/table/block_based/block_builder.h new file mode 100644 index 000000000..42c996e5b --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <vector> + +#include <stdint.h> +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/block_based/data_block_hash_index.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder { + public: + BlockBuilder(const BlockBuilder&) = delete; + void operator=(const BlockBuilder&) = delete; + + explicit BlockBuilder(int block_restart_interval, + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { return buffer_.empty(); } + + private: + const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder + const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; + + std::string buffer_; // Destination buffer + std::vector<uint32_t> restarts_; // Restart points + size_t estimate_; + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc new file mode 100644 index 000000000..f9d92c74c --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefix_index.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
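Since block_builder.h above fully declares the public interface, a minimal usage sketch may help; this is illustrative only (not part of this diff), and it relies on the documented requirement that keys are added in sorted order.

#include "table/block_based/block_builder.h"

using ROCKSDB_NAMESPACE::BlockBuilder;
using ROCKSDB_NAMESPACE::Slice;

void BuildOneBlock() {
  BlockBuilder builder(16 /* block_restart_interval */);
  // Keys must be added in increasing order (see Add() above).
  builder.Add(Slice("apple"), Slice("value1"));
  builder.Add(Slice("apricot"), Slice("value2"));  // "ap" prefix is shared
  Slice raw = builder.Finish();  // valid until Reset() or destruction
  (void)raw;  // would normally be compressed and written to the table file
  builder.Reset();  // reuse the builder for the next block
}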
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+  return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+  return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the entry
+// stores the block id directly. If there is more than one block per bucket,
+// because of hash collision or a single prefix spanning multiple blocks,
+// the entry points to an array of block ids. The block array is an array of
+// uint32_t's. The first uint32_t indicates the total number of blocks, followed
+// by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for an empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+  return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+  uint32_t index = block_id ^ kBlockArrayMask;
+  assert(index < kBlockArrayMask);
+  return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+  assert(index < kBlockArrayMask);
+  return index | kBlockArrayMask;
+}
+
+// Temporary storage for prefix information during index building
+struct PrefixRecord {
+  Slice prefix;
+  uint32_t start_block;
+  uint32_t end_block;
+  uint32_t num_blocks;
+  PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+  explicit Builder(const SliceTransform* internal_prefix_extractor)
+      : internal_prefix_extractor_(internal_prefix_extractor) {}
+
+  void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+        arena_.AllocateAligned(sizeof(PrefixRecord)));
+    record->prefix = key_prefix;
+    record->start_block = start_block;
+    record->end_block = start_block + num_blocks - 1;
+    record->num_blocks = num_blocks;
+    prefixes_.push_back(record);
+  }
+
+  BlockPrefixIndex* Finish() {
+    // For now, use roughly a 1:1 prefix-to-bucket ratio.
+    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+    // Collect prefix records that hash to the same bucket into a single
+    // linked list.
+    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+    for (PrefixRecord* current : prefixes_) {
+      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+      // Merge the prefix block span if the first block of this prefix is
+      // connected to the last block of the previous prefix.
+ PrefixRecord* prev = prefixes_per_bucket[bucket]; + if (prev) { + assert(current->start_block >= prev->end_block); + auto distance = current->start_block - prev->end_block; + if (distance <= 1) { + prev->end_block = current->end_block; + prev->num_blocks = prev->end_block - prev->start_block + 1; + num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1); + continue; + } + } + current->next = prev; + prefixes_per_bucket[bucket] = current; + num_blocks_per_bucket[bucket] += current->num_blocks; + } + + // Calculate the block array buffer size + uint32_t total_block_array_entries = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks > 1) { + total_block_array_entries += (num_blocks + 1); + } + } + + // Populate the final prefix block index + uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; + uint32_t* buckets = new uint32_t[num_buckets]; + uint32_t offset = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks == 0) { + assert(prefixes_per_bucket[i] == nullptr); + buckets[i] = kNoneBlock; + } else if (num_blocks == 1) { + assert(prefixes_per_bucket[i] != nullptr); + assert(prefixes_per_bucket[i]->next == nullptr); + buckets[i] = prefixes_per_bucket[i]->start_block; + } else { + assert(total_block_array_entries > 0); + assert(prefixes_per_bucket[i] != nullptr); + buckets[i] = EncodeIndex(offset); + block_array_buffer[offset] = num_blocks; + uint32_t* last_block = &block_array_buffer[offset + num_blocks]; + auto current = prefixes_per_bucket[i]; + // populate block ids from largest to smallest + while (current != nullptr) { + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; + last_block--; + } + current = current->next; + } + assert(last_block == &block_array_buffer[offset]); + offset += (num_blocks + 1); + } + } + + assert(offset == total_block_array_entries); + + return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, + buckets, total_block_array_entries, + block_array_buffer); + } + + private: + const SliceTransform* internal_prefix_extractor_; + + std::vector<PrefixRecord*> prefixes_; + Arena arena_; +}; + +Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index) { + uint64_t pos = 0; + auto meta_pos = prefix_meta; + Status s; + Builder builder(internal_prefix_extractor); + + while (!meta_pos.empty()) { + uint32_t prefix_size = 0; + uint32_t entry_index = 0; + uint32_t num_blocks = 0; + if (!GetVarint32(&meta_pos, &prefix_size) || + !GetVarint32(&meta_pos, &entry_index) || + !GetVarint32(&meta_pos, &num_blocks)) { + s = Status::Corruption( + "Corrupted prefix meta block: unable to read from it."); + break; + } + if (pos + prefix_size > prefixes.size()) { + s = Status::Corruption( + "Corrupted prefix meta block: size inconsistency."); + break; + } + Slice prefix(prefixes.data() + pos, prefix_size); + builder.Add(prefix, entry_index, num_blocks); + + pos += prefix_size; + } + + if (s.ok() && pos != prefixes.size()) { + s = Status::Corruption("Corrupted prefix meta block"); + } + + if (s.ok()) { + *prefix_index = builder.Finish(); + } + + return s; +} + +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { + Slice prefix = internal_prefix_extractor_->Transform(key); + + uint32_t bucket = PrefixToBucket(prefix, num_buckets_); + 
uint32_t block_id = buckets_[bucket];
+
+  if (IsNone(block_id)) {
+    return 0;
+  } else if (IsBlockId(block_id)) {
+    *blocks = &buckets_[bucket];
+    return 1;
+  } else {
+    uint32_t index = DecodeIndex(block_id);
+    assert(index < num_block_array_buffer_entries_);
+    *blocks = &block_array_buffer_[index + 1];
+    uint32_t num_blocks = block_array_buffer_[index];
+    assert(num_blocks > 1);
+    assert(index + num_blocks < num_block_array_buffer_entries_);
+    return num_blocks;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.h b/src/rocksdb/table/block_based/block_prefix_index.h
new file mode 100644
index 000000000..04121320e
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockPrefixIndex {
+ public:
+  // Maps a key to a list of data blocks that could potentially contain
+  // the key, based on the prefix.
+  // Returns the total number of relevant blocks; 0 means the key does
+  // not exist.
+  uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return sizeof(BlockPrefixIndex) +
+           (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+  }
+
+  // Create the hash index by reading from the metadata blocks.
+  // @params prefixes: a sequence of prefixes.
+  // @params prefix_meta: contains the "metadata" of the prefixes.
+  static Status Create(const SliceTransform* hash_key_extractor,
+                       const Slice& prefixes, const Slice& prefix_meta,
+                       BlockPrefixIndex** prefix_index);
+
+  ~BlockPrefixIndex() {
+    delete[] buckets_;
+    delete[] block_array_buffer_;
+  }
+
+ private:
+  class Builder;
+  friend Builder;
+
+  BlockPrefixIndex(const SliceTransform* internal_prefix_extractor,
+                   uint32_t num_buckets, uint32_t* buckets,
+                   uint32_t num_block_array_buffer_entries,
+                   uint32_t* block_array_buffer)
+      : internal_prefix_extractor_(internal_prefix_extractor),
+        num_buckets_(num_buckets),
+        num_block_array_buffer_entries_(num_block_array_buffer_entries),
+        buckets_(buckets),
+        block_array_buffer_(block_array_buffer) {}
+
+  const SliceTransform* internal_prefix_extractor_;
+  uint32_t num_buckets_;
+  uint32_t num_block_array_buffer_entries_;
+  uint32_t* buckets_;
+  uint32_t* block_array_buffer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_test.cc b/src/rocksdb/table/block_based/block_test.cc
new file mode 100644
index 000000000..efa5b3ae3
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_test.cc
@@ -0,0 +1,627 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
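The bucket-entry encoding documented in block_prefix_index.cc above can be restated as a small standalone sketch (illustrative only, not part of this diff): the high bit flags an index into the block array, and 0x7FFFFFFF marks an empty bucket.

#include <cstdint>

constexpr uint32_t kNoneBlock = 0x7FFFFFFF;       // empty-bucket sentinel
constexpr uint32_t kBlockArrayMask = 0x80000000;  // 'pointer' flag bit

// Decode one bucket entry; returns the number of candidate blocks and
// points *blocks at the first block id (mirrors GetBlocks() above).
uint32_t DecodeBucket(const uint32_t* bucket_entry,
                      const uint32_t* block_array, const uint32_t** blocks) {
  uint32_t entry = *bucket_entry;
  if (entry == kNoneBlock) {
    return 0;  // no prefix hashed to this bucket
  }
  if ((entry & kBlockArrayMask) == 0) {
    *blocks = bucket_entry;  // the entry itself is the single block id
    return 1;
  }
  uint32_t index = entry ^ kBlockArrayMask;  // strip the pointer bit
  *blocks = &block_array[index + 1];         // block ids follow the count
  return block_array[index];                 // first slot stores the count
}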
+//
+
+#include <stdio.h>
+#include <algorithm>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random *rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefixes
+  for (int i = from; i < from + len; i += step) {
+    // generate keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100-byte values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}
+
+class BlockTest : public testing::Test {};
+
+// block test
+TEST_F(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(16);
+  int num_records = 100000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  // read contents of block sequentially
+  int count = 0;
+  InternalIterator *iter =
+      reader.NewDataIterator(options.comparator, options.comparator);
+  for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewDataIterator(options.comparator, options.comparator);
+  for (int i = 0; i < num_records; i++) {
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int /*prefix_group_size*/ = 1) {
+  builder->reset(new BlockBuilder(1 /* restart interval */));
+
+  // Add all of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  BlockContents contents_ref(contents.data);
+  Block reader1(std::move(contents), kDisableGlobalSequenceNumber);
+  Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  std::unique_ptr<InternalIterator> regular_iter(
+      reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    regular_iter->Seek(keys[i]);
+    ASSERT_OK(regular_iter->status());
+    ASSERT_TRUE(regular_iter->Valid());
+
+    Slice v = regular_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For the hash index, if a key with a given prefix is not found, the
+  // iterator will simply be set as invalid; whereas the binary-search-based
+  // iterator will return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST_F(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys that start with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with the same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+// A slow and accurate version of BlockReadAmpBitmap that simply stores
+// all the marked ranges in a set.
+class BlockReadAmpBitmapSlowAndAccurate {
+ public:
+  void Mark(size_t start_offset, size_t end_offset) {
+    assert(end_offset >= start_offset);
+    marked_ranges_.emplace(end_offset, start_offset);
+  }
+
+  void ResetCheckSequence() { iter_valid_ = false; }
+
+  // Return true if any byte in this range was Marked.
+  // This does a linear search from the previous position. When calling
+  // multiple times, `offset` needs to be incremental to get correct results.
+  // Call ResetCheckSequence() to reset it.
+  bool IsPinMarked(size_t offset) {
+    if (iter_valid_) {
+      // Has an existing iterator; try a linear search from
+      // the iterator.
+      for (int i = 0; i < 64; i++) {
+        if (offset < iter_->second) {
+          return false;
+        }
+        if (offset <= iter_->first) {
+          return true;
+        }
+
+        iter_++;
+        if (iter_ == marked_ranges_.end()) {
+          iter_valid_ = false;
+          return false;
+        }
+      }
+    }
+    // Initial call, or we have linear searched too many times.
+    // Do a binary search.
+    iter_ = marked_ranges_.lower_bound(
+        std::make_pair(offset, static_cast<size_t>(0)));
+    if (iter_ == marked_ranges_.end()) {
+      iter_valid_ = false;
+      return false;
+    }
+    iter_valid_ = true;
+    return offset <= iter_->first && offset >= iter_->second;
+  }
+
+ private:
+  std::set<std::pair<size_t, size_t>> marked_ranges_;
+  std::set<std::pair<size_t, size_t>>::iterator iter_;
+  bool iter_valid_ = false;
+};
+
+TEST_F(BlockTest, BlockReadAmpBitmap) {
+  uint32_t pin_offset = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
+        pin_offset = *(static_cast<uint32_t *>(arg));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<size_t> block_sizes = {
+      1,                // 1 byte
+      32,               // 32 bytes
+      61,               // 61 bytes
+      64,               // 64 bytes
+      512,              // 0.5 KB
+      1024,             // 1 KB
+      1024 * 4,         // 4 KB
+      1024 * 10,        // 10 KB
+      1024 * 50,        // 50 KB
+      1024 * 1024 * 4,  // 4 MB
+      777,
+      124653,
+  };
+  const size_t kBytesPerBit = 64;
+
+  Random rnd(301);
+  for (size_t block_size : block_sizes) {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
+    BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
+
+    size_t needed_bits = (block_size / kBytesPerBit);
+    if (block_size % kBytesPerBit != 0) {
+      needed_bits++;
+    }
+
+    ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
+
+    // Generate some random entries
+    std::vector<size_t> random_entry_offsets;
+    for (int i = 0; i < 1000; i++) {
+      random_entry_offsets.push_back(rnd.Next() % block_size);
+    }
+    std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
+    auto it =
+        std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
+    random_entry_offsets.resize(
+        std::distance(random_entry_offsets.begin(), it));
+
+    std::vector<std::pair<size_t, size_t>> random_entries;
+    for (size_t i = 0; i < random_entry_offsets.size(); i++) {
+      size_t entry_start = random_entry_offsets[i];
+      size_t entry_end;
+      if (i + 1 < random_entry_offsets.size()) {
+        entry_end = random_entry_offsets[i + 1] - 1;
+      } else {
+        entry_end = block_size - 1;
+      }
+      random_entries.emplace_back(entry_start, entry_end);
+    }
+
+    for (size_t i = 0; i < random_entries.size(); i++) {
+      read_amp_slow_and_accurate.ResetCheckSequence();
+      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+
+      read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
+                           static_cast<uint32_t>(current_entry.second));
+      read_amp_slow_and_accurate.Mark(current_entry.first,
+                                      current_entry.second);
+
+      size_t total_bits = 0;
+      for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
+        total_bits += read_amp_slow_and_accurate.IsPinMarked(
+            bit_idx * kBytesPerBit + pin_offset);
+      }
+      size_t expected_estimate_useful = total_bits * kBytesPerBit;
+      size_t got_estimate_useful =
+          stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+      ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlockTest, BlockWithReadAmpBitmap) {
+  Random rnd(301);
+  Options options = Options();
+
+
std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 10000; + + GenerateRandomKVs(&keys, &values, 0, num_records, 1); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + Slice rawblock = builder.Finish(); + const size_t kBytesPerBit = 8; + + // Read the block sequentially using Next() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + // read contents of block sequentially + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + + delete iter; + } + + // Read the block sequentially using Seek() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + for (int i = 0; i < num_records; i++) { + Slice k(keys[i]); + + // search in block for this key + iter->Seek(k); + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + delete iter; + } + + // Read the block randomly + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + std::unordered_set<int> read_keys; + for (int i = 0; i < num_records; i++) { + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + iter->Seek(k); + iter->value(); + if (read_keys.find(index) == read_keys.end()) { + read_keys.insert(index); + read_bytes += iter->TEST_CurrentEntrySize(); + } + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + // Error in read 
amplification will be less than 2% if we are reading + // randomly + EXPECT_LT(error_pct, 2); + } + delete iter; + } +} + +TEST_F(BlockTest, ReadAmpBitmapPow2) { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); +} + +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface<std::tuple<bool, bool>> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector<std::string> *separators, + std::vector<BlockHandle> *block_handles, + std::vector<std::string> *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set<std::string> keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. 
+ keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector<std::string> separators; + std::vector<BlockHandle> block_handles; + std::vector<std::string> first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? 
first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_type.h b/src/rocksdb/table/block_based/block_type.h new file mode 100644 index 000000000..b2a913746 --- /dev/null +++ b/src/rocksdb/table/block_based/block_type.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> + +namespace ROCKSDB_NAMESPACE { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. + +enum class BlockType : uint8_t { + kData, + kFilter, + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/cachable_entry.h b/src/rocksdb/table/block_based/cachable_entry.h new file mode 100644 index 000000000..598f1ef57 --- /dev/null +++ b/src/rocksdb/table/block_based/cachable_entry.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> +#include "port/likely.h" +#include "rocksdb/cache.h" +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. +// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). 
+// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. + +template <class T> +class CachableEntry { +public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value) + , cache_(cache) + , cache_handle_(cache_handle) + , own_value_(own_value) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) + : value_(rhs.value_) + , cache_(rhs.cache_) + , cache_handle_(rhs.cache_handle_) + , own_value_(rhs.own_value_) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { + ReleaseResource(); + } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value; + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(value != nullptr); + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + +private: + void ReleaseResource() { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + 
delete value_; + } + } + + void ResetFields() { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast<Cache*>(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast<T*>(arg1); + } + +private: + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.cc b/src/rocksdb/table/block_based/data_block_footer.cc new file mode 100644 index 000000000..5d5d8ed55 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/data_block_footer.h" + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.h b/src/rocksdb/table/block_based/data_block_footer.h new file mode 100644 index 000000000..c1cfd4730 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
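The pack/unpack pair above hinges on one bit trick: the restart count occupies the low 31 bits, and the MSB flags the index type. A self-contained model of the same scheme (not from the tree; names are illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

constexpr int kIndexTypeBitShift = 31;
constexpr uint32_t kNumRestartsMask = (1u << kIndexTypeBitShift) - 1u;

// Pack the index-type flag into the MSB and the restart count below it.
uint32_t Pack(bool binary_and_hash, uint32_t num_restarts) {
  assert(num_restarts <= kNumRestartsMask);
  return num_restarts | (binary_and_hash ? 1u << kIndexTypeBitShift : 0u);
}

void Unpack(uint32_t footer, bool* binary_and_hash, uint32_t* num_restarts) {
  *binary_and_hash = (footer & (1u << kIndexTypeBitShift)) != 0;
  *num_restarts = footer & kNumRestartsMask;
}

int main() {
  bool uses_hash_index = false;
  uint32_t num_restarts = 0;
  Unpack(Pack(true, 42), &uses_hash_index, &num_restarts);
  std::cout << uses_hash_index << " " << num_restarts << "\n";  // prints: 1 42
}
```

Because a footer with a clear MSB still reads correctly as a plain restart count, the packed format stays compatible with legacy blocks, as the hash-index comments later in this diff point out.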
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts);
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.cc b/src/rocksdb/table/block_based/data_block_hash_index.cc
new file mode 100644
index 000000000..222475834
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DataBlockHashIndexBuilder::Add(const Slice& key,
+                                    const size_t restart_index) {
+  assert(Valid());
+  if (restart_index > kMaxRestartSupportedByHashIndex) {
+    valid_ = false;
+    return;
+  }
+
+  uint32_t hash_value = GetSliceHash(key);
+  hash_and_restart_pairs_.emplace_back(hash_value,
+                                       static_cast<uint8_t>(restart_index));
+  estimated_num_buckets_ += bucket_per_key_;
+}
+
+void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
+  assert(Valid());
+  uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
+
+  if (num_buckets == 0) {
+    num_buckets = 1;  // sanity check
+  }
+
+  // The built-in hash cannot distribute strings well across buckets when
+  // num_buckets is a power of two, resulting in a high hash collision rate.
+  // We make num_buckets odd to avoid this issue.
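Before the `num_buckets |= 1` below, it is worth seeing why a power-of-two bucket count is risky: `hash % 2^k` keeps only the low k bits of the hash, while an odd modulus draws on all bits. A standalone illustration (not from the tree):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hashes that differ only above the low 4 bits...
  for (uint32_t h : {0x10u, 0x20u, 0x30u, 0x40u}) {
    // ...all collide with a power-of-two bucket count,
    std::cout << h % 16 << " ";  // prints: 0 0 0 0
  }
  std::cout << "\n";
  for (uint32_t h : {0x10u, 0x20u, 0x30u, 0x40u}) {
    // ...but spread out once the count is forced odd (16 | 1 == 17).
    std::cout << h % 17 << " ";  // prints: 16 15 14 13
  }
  std::cout << "\n";
}
```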
+ num_buckets |= 1; + + std::vector<uint8_t> buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast<const char*>(reinterpret_cast<char*>(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_hash_index.h b/src/rocksdb/table/block_based/data_block_hash_index.h new file mode 100644 index 000000000..f356395f3 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_hash_index.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). +// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. 
+// NUM_BUCK: Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flags:
+//   kNoEntry=255,
+//   kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can support is 253.
+//
+// Buckets are initialized to kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in the
+// bucket. If there is already a restart index there, we update the existing
+// restart index to a collision marker (kCollision). If the bucket is already
+// marked as a collision, we do not store the restart index either.
+//
+// During a query, a key is first hashed to a bucket. If the bucket stores
+// nothing (kNoEntry), the key is not in this block. If the bucket is marked
+// as a collision (kCollision), we fall back to searching the restart
+// intervals. Otherwise, the bucket stores the restart index of the key, and
+// we go directly to that restart interval to search for the key.
+//
+// Note that we only support blocks with fewer than 254 restart intervals.
+// If a block has more restart intervals than that, a hash index will not be
+// created for it.
+
+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t addresses, we only support blocks of no more than
+// 64KB.
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
+
+class DataBlockHashIndexBuilder {
+ public:
+  DataBlockHashIndexBuilder()
+      : bucket_per_key_(-1 /*uninitialized marker*/),
+        estimated_num_buckets_(0),
+        valid_(false) {}
+
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check
+    }
+    bucket_per_key_ = 1 / util_ratio;
+    valid_ = true;
+  }
+
+  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
+  void Finish(std::string& buffer);
+  void Reset();
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets =
+        static_cast<uint16_t>(estimated_num_buckets_);
+
+    // Matching the num_buckets computation in
+    // DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }
+
+ private:
+  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
+  double estimated_num_buckets_;
+
+  // Now the only usage for `valid_` is to mark it false when an inserted
+  // restart_index is larger than supported. In that case the HashIndex is
+  // not appended to the block content.
+  bool valid_;
+
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+  DataBlockHashIndex() : num_buckets_(0) {}
+
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+  uint8_t Lookup(const char* data, uint32_t map_offset,
+                 const Slice& key) const;
+
+  inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+  // To make the serialized hash index compact and to save space overhead,
+  // all the data fields persisted in the block are in uint16 format. A
+  // uint16 is large enough to index every offset of a 64KiB block.
+  // In other words, DataBlockHashIndex does not support blocks of size equal
+  // to or greater than 64KiB.
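The kNoEntry/kCollision protocol described above can be captured in a few lines. The following toy model (not the RocksDB implementation; it substitutes std::hash for GetSliceHash) shows both the build and the lookup sides:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

constexpr uint8_t kNoEntryV = 255;    // bucket never written
constexpr uint8_t kCollisionV = 254;  // two different restart indexes hashed here

int main() {
  std::vector<uint8_t> buckets(7, kNoEntryV);  // odd bucket count, as above
  auto bucket_of = [&](const std::string& k) {
    return std::hash<std::string>{}(k) % buckets.size();
  };

  auto add = [&](const std::string& key, uint8_t restart_index) {
    uint8_t& b = buckets[bucket_of(key)];
    if (b == kNoEntryV) {
      b = restart_index;  // first key in this bucket
    } else if (b != restart_index) {
      b = kCollisionV;    // a different restart index: poison the bucket
    }
  };

  add("apple", 0);
  add("banana", 1);
  add("cherry", 2);

  uint8_t r = buckets[bucket_of("banana")];
  if (r == kNoEntryV) {
    std::cout << "not in this block\n";
  } else if (r == kCollisionV) {
    std::cout << "collision: binary-search the restart intervals\n";
  } else {
    std::cout << "scan restart interval " << static_cast<int>(r) << "\n";
  }
}
```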
+  uint16_t num_buckets_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index_test.cc b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
new file mode 100644
index 000000000..8548c8508
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
@@ -0,0 +1,719 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  uint8_t entry = index.Lookup(data, map_offset, key);
+  if (entry == kCollision) {
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    return false;
+  }
+
+  return entry == restart_point;
+}
+
+// Random KV generator similar to the one in block_test
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs. The generated keys will be sorted. You
+// can tune the parameters to generate different kinds of test key/value
+// pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string>* keys, + std::vector<std::string>* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid
+  for (int i = 0; i <= 253; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  // #restarts > 253. HashIndex is not used
+  for (int i = 0; i <= 254; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+  Options options = Options();
+  std::string ukey(10, 'k');
+  InternalKey ikey(ukey, 0, kTypeValue);
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       false /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  {
+    // insert a large value. The block size plus HashIndex is 65536.
+    std::string value(65502, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  {
+    // insert a large value. The block size plus HashIndex would be 65537,
+    // which exceeds the max block size supported by HashIndex (65536).
+    // So when the build finishes, no HashIndex is created for the block.
+    std::string value(65503, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    // the index type has fallen back to binary search when the build
+    // finished.
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector<std::string> keys; + std::vector<std::string> values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+  const InternalKeyComparator icmp(BytewiseComparator());
+
+  // random seek existent keys
+  for (int i = 0; i < num_records; i++) {
+    auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator());
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(values[index], iter->value());
+
+    delete iter;
+  }
+
+  // random seek non-existent user keys
+  // In case A), the user_key cannot be found in the HashIndex. The key may
+  // exist in the next block. So the iter is invalidated to tell the caller
+  // to search the next block. This test case belongs to case A).
+  //
+  // Note that for non-existent keys, there is a possibility of false
+  // positives, i.e. the key is still hashed into some restart interval.
+  // Two additional outcomes are possible:
+  // B) we linearly seek the restart interval and do not find the key; the
+  //    iter stops at the start of the next restart interval. The key does
+  //    not exist anywhere.
+  // C) we linearly seek the restart interval and do not find the key; the
+  //    iter stops at the end of the block, i.e. restarts_. The key may
+  //    exist in the next block.
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<RandomAccessFileReader> file_reader; + std::unique_ptr<TableReader> table_reader; + int level_ = -1; + + std::vector<std::string> keys; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer.reset( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr<TableBuilder> builder; + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, options.sample_for_compression, + CompressionOptions(), false /* skip_filters */, + column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + file_writer->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ( + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), + builder->FileSize()); + + // Open the table + file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), + 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), + &table_reader); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // insert two large k/v pair. 
Given that the block_size is 4096, one k/v + // pair will take up one block. + // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/filter_block.h b/src/rocksdb/table/block_based/filter_block.h new file mode 100644 index 000000000..1ad8d3f18 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block.h @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file. It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+//
+// This is the base class for BlockBasedFilter and FullFilter, both of which
+// are used in BlockBasedTable. The first contains filters for a portion of
+// the keys in an SST file; the second contains a filter for all keys in the
+// SST file.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kNotValid = ULLONG_MAX;
+class FilterPolicy;
+
+class GetContext;
+using MultiGetRange = MultiGetContext::Range;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//   (StartBlock Add*)* Finish
+//
+// BlockBased/Full FilterBlocks are called in the same way.
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder() {}
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&) = delete;
+  void operator=(const FilterBlockBuilder&) = delete;
+
+  virtual ~FilterBlockBuilder() {}
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+  virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
+  virtual void Add(const Slice& key) = 0;  // Add a key to current filter
+  virtual size_t NumAdded() const = 0;  // Number of keys added
+  Slice Finish() {  // Generate Filter
+    const BlockHandle empty_handle;
+    Status dont_care_status;
+    auto ret = Finish(empty_handle, &dont_care_status);
+    assert(dont_care_status.ok());
+    return ret;
+  }
+  virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+};
+
+// A FilterBlockReader is used to parse a filter from an SST table.
+// KeyMayMatch and PrefixMayMatch trigger the filter checking.
+//
+// BlockBased/Full FilterBlocks are called in the same way.
+class FilterBlockReader {
+ public:
+  FilterBlockReader() = default;
+  virtual ~FilterBlockReader() = default;
+
+  FilterBlockReader(const FilterBlockReader&) = delete;
+  FilterBlockReader& operator=(const FilterBlockReader&) = delete;
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+
+  /**
+   * If no_io is set, then it returns true if it cannot answer the query
+   * without reading data from disk. This is used in
+   * PartitionedFilterBlockReader to avoid reading partitions that are not
+   * already in the block cache.
+   *
+   * Normally filters are built on only the user keys and the InternalKey is
+   * not needed for a query. The index in PartitionedFilterBlockReader however
+   * is built upon InternalKey and must be provided via const_ikey_ptr when
+   * running queries.
+ */ + virtual bool KeyMayMatch(const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + + virtual void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey = iter->ukey; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { + range->SkipKey(iter); + } + } + } + + /** + * no_io and const_ikey_ptr here means the same as in KeyMayMatch + */ + virtual bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + + virtual void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey = iter->ukey; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey) && + !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, + block_offset, no_io, &ikey, get_context, + lookup_context)) { + range->SkipKey(iter); + } + } + } + + virtual size_t ApproximateMemoryUsage() const = 0; + + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + + virtual void CacheDependencies(bool /*pin*/) {} + + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) { + if (need_upper_bound_check) { + return true; + } + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.cc b/src/rocksdb/table/block_based/filter_block_reader_common.cc new file mode 100644 index 000000000..fa0802669 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + return s; +} + +template <typename TBlocklike> +const SliceTransform* +FilterBlockReaderCommon<TBlocklike>::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + cache_filter_blocks(), get_context, lookup_context, + filter_block); +} + +template <typename TBlocklike> +size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon<BlockContents>; +template class FilterBlockReaderCommon<Block>; +template class FilterBlockReaderCommon<ParsedFullFilterBlock>; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.h b/src/rocksdb/table/block_based/filter_block_reader_common.h new file mode 100644 index 000000000..a18bc5449 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include <cassert> +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template <typename TBlocklike> +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry<TBlocklike>&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry<TBlocklike> filter_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy.cc b/src/rocksdb/table/block_based/filter_policy.cc new file mode 100644 index 000000000..c8f23ee33 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy.cc @@ -0,0 +1,759 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
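The `template class FilterBlockReaderCommon<...>;` lines above rely on explicit template instantiation, which is what lets the member definitions live in the .cc file rather than the header. A minimal standalone sketch of the pattern (hypothetical `Holder` type and file names, not from the tree):

```cpp
// holder.h: declarations only.
template <typename T>
class Holder {
 public:
  explicit Holder(T v);
  T get() const;

 private:
  T v_;
};

// holder.cc: out-of-line definitions, plus explicit instantiations for the
// types the rest of the program is allowed to use.
template <typename T>
Holder<T>::Holder(T v) : v_(v) {}

template <typename T>
T Holder<T>::get() const {
  return v_;
}

// Without these two lines, any user of holder.h would hit link errors,
// because no instantiation of the members would exist in any object file.
template class Holder<int>;
template class Holder<double>;
```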
+ +#include <array> +#include <deque> + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "third-party/folly/folly/ConstexprMath.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit FastLocalBloomBitsBuilder(const int millibits_per_key) + : millibits_per_key_(millibits_per_key), + num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + if (hash_entries_.empty() || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + virtual Slice Finish(std::unique_ptr<const char[]>* buf) override { + uint32_t len_with_metadata = + CalculateSpace(static_cast<uint32_t>(hash_entries_.size())); + char* data = new char[len_with_metadata]; + memset(data, 0, len_with_metadata); + + assert(data); + assert(len_with_metadata >= 5); + + uint32_t len = len_with_metadata - 5; + if (len > 0) { + AddAllEntries(data, len); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + data[len] = static_cast<char>(-1); + // 0 = Marker for this sub-implementation + data[len + 1] = static_cast<char>(0); + // num_probes (and 0 in upper bits for 64-byte block size) + data[len + 2] = static_cast<char>(num_probes_); + // rest of metadata stays zero + + const char* const_data = data; + buf->reset(const_data); + assert(hash_entries_.empty()); + + return Slice(data, len_with_metadata); + } + + int CalculateNumEntry(const uint32_t bytes) override { + uint32_t bytes_no_meta = bytes >= 5u ? 
bytes - 5u : 0; + return static_cast<int>(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t num_cache_lines = 0; + if (millibits_per_key_ > 0 && num_entry > 0) { + num_cache_lines = static_cast<uint32_t>( + (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + } + return num_cache_lines * 64 + /*metadata*/ 5; + } + + double EstimatedFpRate(size_t keys, size_t bytes) override { + return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + num_probes_, /*hash bits*/ 64); + } + + private: + void AddAllEntries(char* data, uint32_t len) { + // Simple version without prefetching: + // + // for (auto h : hash_entries_) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes_, data); + // } + + const size_t num_entries = hash_entries_.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array<uint32_t, kBufferMask + 1> hashes; + std::array<uint32_t, kBufferMask + 1> byte_offsets; + + // Prime the buffer + size_t i = 0; + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = hash_entries_.front(); + hash_entries_.pop_front(); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + data + byte_offset_ref); + // And buffer + uint64_t h = hash_entries_.front(); + hash_entries_.pop_front(); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + data + byte_offsets[i]); + } + } + + int millibits_per_key_; + int num_probes_; + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. 
+ std::deque<uint64_t> hash_entries_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public FilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>; + +class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); + + // No Copy allowed + LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; + void operator=(const LegacyBloomBitsBuilder&) = delete; + + ~LegacyBloomBitsBuilder() override; + + void AddKey(const Slice& key) override; + + Slice Finish(std::unique_ptr<const char[]>* buf) override; + + int CalculateNumEntry(const uint32_t bytes) override; + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entry, &dont_care1, &dont_care2); + } + + double EstimatedFpRate(size_t keys, size_t bytes) override { + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + num_probes_); + } + + private: + int bits_per_key_; + int num_probes_; + std::vector<uint32_t> hash_entries_; + Logger* info_log_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, + Logger* info_log) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), + info_log_(info_log) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) { + uint32_t total_bits, num_lines; + size_t num_entries = hash_entries_.size(); + char* data = + ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + + // Check for excessive entries for 32-bit hash function + if (num_entries >= /* minimum of 3 million */ 3000000U) { + // More specifically, we can detect that the 32-bit hash function + // is causing significant increase in FP rate by comparing current + // estimated FP rate to what we would get with a normal number of + // keys at same memory ratio. + double est_fp_rate = LegacyBloomImpl::EstimatedFpRate( + num_entries, total_bits / 8, num_probes_); + double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate( + 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_); + + if (est_fp_rate >= 1.50 * vs_fp_rate) { + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + info_log_, + "Using legacy SST/BBT Bloom filter with excessive key count " + "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. " + "Consider using new Bloom with format_version>=5, smaller SST " + "file size, or partitioned filters.", + num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate); + } + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast<char>(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + int high = static_cast<int>(bytes * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, + uint32_t total_bits) { +#ifdef NDEBUG + static_cast<void>(total_bits); +#endif + assert(num_lines > 0 && total_bits > 0); + + LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, + folly::constexpr_log2(CACHE_LINE_SIZE)); +} + +class LegacyBloomBitsReader : public FilterBitsReader { + public: + LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, + uint32_t log2_cache_line_size) + : data_(data), + num_probes_(num_probes), + num_lines_(num_lines), + log2_cache_line_size_(log2_cache_line_size) {} + + // No Copy allowed + LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; + void operator=(const LegacyBloomBitsReader&) = delete; + + ~LegacyBloomBitsReader() override {} + + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. 
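+ //
+ // For this legacy format, all probes for a key fall within one cache
+ // line (located by PrepareHashMayMatch), so a negative lookup costs at
+ // most one cache miss on the filter data.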
+ bool MayMatch(const Slice& key) override { + uint32_t hash = BloomHash(key); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(*keys[i]); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t num_lines_; + const uint32_t log2_cache_line_size_; +}; + +class AlwaysTrueFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +} // namespace + +const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllFixedImpls = { + kLegacyBloom, + kDeprecatedBlock, + kFastLocalBloom, +}; + +const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = { + kDeprecatedBlock, + kAuto, +}; + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) + : mode_(mode), warned_(false) { + // Sanitize bits_per_key + if (bits_per_key < 1.0) { + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. + whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomFilterPolicy::~BloomFilterPolicy() {} + +const char* BloomFilterPolicy::Name() const { + return "rocksdb.BuiltinBloomFilter"; +} + +void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We should ideally only be using this deprecated interface for + // appropriately constructed BloomFilterPolicy + assert(mode_ == kDeprecatedBlock); + + // Compute bloom filter size (in both bits and bytes) + uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_); + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
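+ // e.g. with 10 whole bits per key, n = 1 would otherwise get only 16
+ // bits after byte rounding; the 64-bit floor keeps the FP rate low for
+ // tiny filters at negligible space cost.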
+ if (bits < 64) bits = 64; + + uint32_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + int num_probes = + LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast<char>(num_probes)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (int i = 0; i < n; i++) { + LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, + array); + } +} + +bool BloomFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2 || len > 0xffffffffU) { + return false; + } + + const char* array = bloom_filter.data(); + const uint32_t bits = static_cast<uint32_t>(len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const int k = static_cast<uint8_t>(array[len - 1]); + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + // NB: using stored k not num_probes for whole_bits_per_key_ + return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, + array); +} + +FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { + // This code path should no longer be used, for the built-in + // BloomFilterPolicy. Internal to RocksDB and outside + // BloomFilterPolicy, only get a FilterBitsBuilder with + // BloomFilterPolicy::GetBuilderFromContext(), which will call + // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have + // been warned (HISTORY.md) that they can no longer call this on + // the built-in BloomFilterPolicy (unlikely). + assert(false); + return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + Mode cur = mode_; + // Unusual code construction so that we can have just + // one exhaustive switch without (risky) recursion + for (int i = 0; i < 2; ++i) { + switch (cur) { + case kAuto: + if (context.table_options.format_version < 5) { + cur = kLegacyBloom; + } else { + cur = kFastLocalBloom; + } + break; + case kDeprecatedBlock: + return nullptr; + case kFastLocalBloom: + return new FastLocalBloomBitsBuilder(millibits_per_key_); + case kLegacyBloom: + if (whole_bits_per_key_ >= 14 && context.info_log && + !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + const char* adjective; + if (whole_bits_per_key_ >= 20) { + adjective = "Dramatic"; + } else { + adjective = "Significant"; + } + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + context.info_log, + "Using legacy Bloom filter with high (%d) bits/key. " + "%s filter space and/or accuracy improvement is available " + "with format_version>=5.", + whole_bits_per_key_, adjective); + } + return new LegacyBloomBitsBuilder(whole_bits_per_key_, + context.info_log); + } + } + assert(false); + return nullptr; // something legal +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. 
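+// The last 5 bytes of the filter are the metadata: 1 byte num_probes (or a
+// negative marker selecting a newer implementation) followed by 4 bytes
+// num_lines (fixed32). See the layout diagrams below.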
+FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + if (len_with_meta <= 5) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast<int8_t>(contents.data()[len_with_meta - 5]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + if (raw_num_probes == -1) { + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + } + // otherwise + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - 5; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// For newer Bloom filter implementations +FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + uint32_t len = len_with_meta - 5; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... 
| + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool use_block_based_builder) { + BloomFilterPolicy::Mode m; + if (use_block_based_builder) { + m = BloomFilterPolicy::kDeprecatedBlock; + } else { + m = BloomFilterPolicy::kAuto; + } + assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), + BloomFilterPolicy::kAllUserModes.end(), + m) != BloomFilterPolicy::kAllUserModes.end()); + return new BloomFilterPolicy(bits_per_key, m); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() { } + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy_internal.h b/src/rocksdb/table/block_based/filter_policy_internal.h new file mode 100644 index 000000000..2ca9dc859 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy_internal.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { + public: + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to CalculateNumEntry should + // return >= the num_entry passed in. + virtual uint32_t CalculateSpace(const int num_entry) = 0; + + // Returns an estimate of the FP rate of the returned filter if + // `keys` keys are added and the filter returned by Finish is `bytes` + // bytes. + virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy. 
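+//
+// Typical configuration from user code (sketch):
+//   BlockBasedTableOptions opts;
+//   opts.format_version = 5;  // lets kAuto select kFastLocalBloom
+//   opts.filter_policy.reset(
+//       NewBloomFilterPolicy(10.0 /* bits_per_key */,
+//                            false /* use_block_based_builder */));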
+class BloomFilterPolicy : public FilterPolicy { + public: + // An internal marker for operating modes of BloomFilterPolicy, in terms + // of selecting an implementation. This makes it easier for tests to track + // or to walk over the built-in set of Bloom filter implementations. The + // only variance in BloomFilterPolicy by mode/implementation is in + // GetFilterBitsBuilder(), so an enum is practical here vs. subclasses. + // + // This enum is essentially the union of all the different kinds of return + // value from GetFilterBitsBuilder, or "underlying implementation", and + // higher-level modes that choose an underlying implementation based on + // context information. + enum Mode { + // Legacy implementation of Bloom filter for full and partitioned filters. + // Set to 0 in case of value confusion with bool use_block_based_builder + // NOTE: TESTING ONLY as this mode does not use best compatible + // implementation + kLegacyBloom = 0, + // Deprecated block-based Bloom filter implementation. + // Set to 1 in case of value confusion with bool use_block_based_builder + // NOTE: DEPRECATED but user exposed + kDeprecatedBlock = 1, + // A fast, cache-local Bloom filter implementation. See description in + // FastLocalBloomImpl. + // NOTE: TESTING ONLY as this mode does not check format_version + kFastLocalBloom = 2, + // Automatically choose from the above (except kDeprecatedBlock) based on + // context at build time, including compatibility with format_version. + // NOTE: This is currently the only recommended mode that is user exposed. + kAuto = 100, + }; + // All the different underlying implementations that a BloomFilterPolicy + // might use, as a mode that says "always use this implementation." + // Only appropriate for unit tests. + static const std::vector<Mode> kAllFixedImpls; + + // All the different modes of BloomFilterPolicy that are exposed from + // user APIs. Only appropriate for higher-level unit tests. Integration + // tests should prefer using NewBloomFilterPolicy (user-exposed). + static const std::vector<Mode> kAllUserModes; + + explicit BloomFilterPolicy(double bits_per_key, Mode mode); + + ~BloomFilterPolicy() override; + + const char* Name() const override; + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // To use this function, call GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. 
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
+
+ // Essentially for testing only: configured millibits/key
+ int GetMillibitsPerKey() const { return millibits_per_key_; }
+ // Essentially for testing only: legacy whole bits/key
+ int GetWholeBitsPerKey() const { return whole_bits_per_key_; }
+
+ private:
+ // Newer filters support fractional bits per key. For predictable behavior
+ // of 0.001-precision values across floating point implementations, we
+ // round to thousandths of a bit (on average) per key.
+ int millibits_per_key_;
+
+ // Older filters round to whole number bits per key. (There *should* be no
+ // compatibility issue with fractional bits per key, but preserving old
+ // behavior with format_version < 5 just in case.)
+ int whole_bits_per_key_;
+
+ // Selected mode (a specific implementation or way of selecting an
+ // implementation) for building new SST filters.
+ Mode mode_;
+
+ // Whether relevant warnings have been logged already. (Remember so we
+ // only report once per BloomFilterPolicy instance, to keep the noise down.)
+ mutable std::atomic<bool> warned_;
+
+ // For newer Bloom filter implementation(s)
+ FilterBitsReader* GetBloomBitsReader(const Slice& contents) const;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.cc b/src/rocksdb/table/block_based/flush_block_policy.cc
new file mode 100644
index 000000000..f5cb2d227
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+#include <cassert>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+ // @params block_size: Approximate size of user data packed per
+ // block.
+ // @params block_size_deviation: This is used to close a block before it
+ // reaches the configured block_size; see
+ // BlockAlmostFull() below.
+ FlushBlockBySizePolicy(const uint64_t block_size,
+ const uint64_t block_size_deviation,
+ const bool align,
+ const BlockBuilder& data_block_builder)
+ : block_size_(block_size),
+ block_size_deviation_limit_(
+ ((block_size * (100 - block_size_deviation)) + 99) / 100),
+ align_(align),
+ data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& key, const Slice& value) override {
+ // it makes no sense to flush when the data block is empty
+ if (data_block_builder_.empty()) {
+ return false;
+ }
+
+ auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+ // Flush if one of the two conditions below is true:
+ // 1) the current estimated size already exceeds the block size,
+ // 2) block_size_deviation is set, the estimated size after appending
+ // the kv would exceed the block size, and the current size is over
+ // the deviation limit.
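+ // e.g. block_size = 4096 and block_size_deviation = 10 give a limit of
+ // 3687 bytes: the block is flushed early only when it is already ~90%
+ // full and the next entry would push it past 4096.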
+ return curr_size >= block_size_ || BlockAlmostFull(key, value);
+ }
+
+ private:
+ bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+ if (block_size_deviation_limit_ == 0) {
+ return false;
+ }
+
+ const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+ auto estimated_size_after =
+ data_block_builder_.EstimateSizeAfterKV(key, value);
+
+ if (align_) {
+ estimated_size_after += kBlockTrailerSize;
+ return estimated_size_after > block_size_;
+ }
+
+ return estimated_size_after > block_size_ &&
+ curr_size > block_size_deviation_limit_;
+ }
+
+ const uint64_t block_size_;
+ const uint64_t block_size_deviation_limit_;
+ const bool align_;
+ const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const {
+ return new FlushBlockBySizePolicy(
+ table_options.block_size, table_options.block_size_deviation,
+ table_options.block_align, data_block_builder);
+}
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const uint64_t size, const int deviation,
+ const BlockBuilder& data_block_builder) {
+ return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.h b/src/rocksdb/table/block_based/flush_block_policy.h
new file mode 100644
index 000000000..68c60c168
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FlushBlockEveryKeyPolicy is currently used only in tests.
+
+class FlushBlockEveryKeyPolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (!start_) {
+ start_ = true;
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ bool start_ = false;
+};
+
+class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryKeyPolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryKeyPolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryKeyPolicy;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.cc b/src/rocksdb/table/block_based/full_filter_block.cc
new file mode 100644
index 000000000..e2f7f476f
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "table/block_based/full_filter_block.h" +#include <array> + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(_prefix_extractor), + whole_key_filtering_(whole_key_filtering), + last_whole_key_recorded_(false), + last_prefix_recorded_(false), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); + if (whole_key_filtering_) { + if (!add_prefix) { + AddKey(key); + } else { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the + // last item. + Slice last_whole_key = Slice(last_whole_key_str_); + if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { + AddKey(key); + last_whole_key_recorded_ = true; + last_whole_key_str_.assign(key.data(), key.size()); + } + } + } + if (add_prefix) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + if (whole_key_filtering_) { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the last + // item. 
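+    // e.g. with NewFixedPrefixTransform(4), adding "app-k1" then "app-k2"
+    // yields the stream "app-k1", "app-", "app-k2", "app-": the duplicate
+    // "app-" entries are not adjacent, hence the explicit tracking here.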
+ Slice last_prefix = Slice(last_prefix_str_); + if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { + AddKey(prefix); + last_prefix_recorded_ = true; + last_prefix_str_.assign(prefix.data(), prefix.size()); + } + } else { + AddKey(prefix); + } +} + +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + +Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + Reset(); + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data_); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { + full_length_enabled_ = + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); + } +} + +bool FullFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /*prefix_extractor*/, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<ParsedFullFilterBlock> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new FullFilterBlockReader(table, std::move(filter_block))); +} + +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + return MayMatch(prefix, no_io, get_context, lookup_context); +} + +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatch(entry)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } + return true; // remain the same with block_based filter +} + +void FullFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const SliceTransform* 
/*prefix_extractor*/, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)range; + (void)block_offset; +#endif + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + // Simply return. Don't skip any key - consider all keys as likely to be + // present + return; + } + MayMatch(range, no_io, nullptr, lookup_context); +} + +void FullFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)range; + (void)block_offset; +#endif + assert(block_offset == kNotValid); + MayMatch(range, no_io, prefix_extractor, lookup_context); +} + +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { + return; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (!filter_bits_reader) { + return; + } + + // We need to use an array instead of autovector for may_match since + // &may_match[0] doesn't work for autovector<bool> (compiler error). So + // declare both keys and may_match as arrays, which is also slightly less + // expensive compared to autovector + std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys; + std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}}; + autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes; + int num_keys = 0; + MultiGetRange filter_range(*range, range->begin(), range->end()); + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!prefix_extractor) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } else { + filter_range.SkipKey(iter); + } + } + + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); + + int i = 0; + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!may_match[i]) { + // Update original MultiGet range to skip this key. 
The filter_range
+ // was temporarily used just to skip keys not in prefix_extractor domain
+ range->SkipKey(iter);
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ } else {
+ // PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ PerfContext* perf_ctx = get_perf_context();
+ perf_ctx->bloom_sst_hit_count++;
+ }
+ ++i;
+ }
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+}
+
+bool FullFilterBlockReader::RangeMayExist(
+ const Slice* iterate_upper_bound, const Slice& user_key,
+ const SliceTransform* prefix_extractor, const Comparator* comparator,
+ const Slice* const const_ikey_ptr, bool* filter_checked,
+ bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) {
+ if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) {
+ *filter_checked = false;
+ return true;
+ }
+ Slice prefix = prefix_extractor->Transform(user_key);
+ if (need_upper_bound_check &&
+ !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
+ *filter_checked = false;
+ return true;
+ } else {
+ *filter_checked = true;
+ return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
+ const_ikey_ptr, /* get_context */ nullptr,
+ lookup_context);
+ }
+}
+
+bool FullFilterBlockReader::IsFilterCompatible(
+ const Slice* iterate_upper_bound, const Slice& prefix,
+ const Comparator* comparator) const {
+ // Try to reuse the bloom filter in the SST table even if prefix_extractor
+ // in mutable_cf_options has changed. If all keys in the range
+ // [user_key, upper_bound) share the same prefix, then we may still be able
+ // to use the bloom filter.
+ const SliceTransform* const prefix_extractor = table_prefix_extractor();
+ if (iterate_upper_bound != nullptr && prefix_extractor) {
+ if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
+ return false;
+ }
+ Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
+ // first, check if user_key and upper_bound share the same prefix
+ if (!comparator->Equal(prefix, upper_bound_xform)) {
+ // second, check if user_key's prefix is the immediate predecessor of
+ // upper_bound and has the same length. If so, we know for sure all
+ // keys in the range [user_key, upper_bound) share the same prefix.
+ // Also need to make sure upper_bound is full length to ensure
+ // correctness
+ if (!full_length_enabled_ ||
+ iterate_upper_bound->size() != prefix_extractor_full_length_ ||
+ !comparator->IsSameLengthImmediateSuccessor(prefix,
+ *iterate_upper_bound)) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.h b/src/rocksdb/table/block_based/full_filter_block.h
new file mode 100644
index 000000000..c72a58021
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
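A note on the batched MayMatch path above: it deliberately separates a gather pass (hash each key and prepare/prefetch its cache line) from a probe pass, so the cache misses of a whole batch of filter probes can overlap instead of being paid serially. Below is a minimal standalone sketch of that two-pass shape; ToyFilter, Prepare, and Check are hypothetical stand-ins for the FastLocalBloomImpl/LegacyBloomImpl helpers, not RocksDB APIs.

#include <array>
#include <cstdint>

struct ToyFilter {
  const char* data = nullptr;
  uint32_t num_lines = 0;  // filter length in 64-byte cache lines; must be > 0

  // Pass 1: pick the cache line for hash h and prefetch it (the role of
  // PrepareHash / PrepareHashMayMatch in the readers above).
  uint32_t Prepare(uint32_t h) const {
    uint32_t byte_offset = (h % num_lines) * 64;
    __builtin_prefetch(data + byte_offset);  // GCC/Clang builtin
    return byte_offset;
  }

  // Pass 2: probe the (ideally already cached) line. A real Bloom filter
  // would test num_probes bit positions; one byte-level test stands in.
  bool Check(uint32_t h, uint32_t byte_offset) const {
    return (data[byte_offset + (h & 63)] & 1) != 0;
  }
};

// All Prepare() calls issue their prefetches before any Check() touches the
// data, overlapping the memory latency across the batch.
void MayMatchBatch(const ToyFilter& f, const uint32_t* hashes, int num_keys,
                   bool* may_match) {
  std::array<uint32_t, 32> offsets;  // 32 assumed as the batch cap here
  for (int i = 0; i < num_keys; ++i) {
    offsets[i] = f.Prepare(hashes[i]);
  }
  for (int i = 0; i < num_keys; ++i) {
    may_match[i] = f.Check(hashes[i], offsets[i]);
  }
}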
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+// The format of full filter block is:
+// +----------------------------------------------------------------+
+// | full filter for all keys in sst file |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At the end of it, we put
+// num_probes: how many hash functions are used in bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+ explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+ bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder);
+ // No copying allowed
+ FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
+ void operator=(const FullFilterBlockBuilder&) = delete;
+
+ // bits_builder is created in filter_policy; it is passed in here directly
+ // and deleted by this builder
+ ~FullFilterBlockBuilder() {}
+
+ virtual bool IsBlockBased() override { return false; }
+ virtual void StartBlock(uint64_t /*block_offset*/) override {}
+ virtual void Add(const Slice& key) override;
+ virtual size_t NumAdded() const override { return num_added_; }
+ virtual Slice Finish(const BlockHandle& tmp, Status* status) override;
+ using FilterBlockBuilder::Finish;
+
+ protected:
+ virtual void AddKey(const Slice& key);
+ std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+ virtual void Reset();
+ void AddPrefix(const Slice& key);
+ const SliceTransform* prefix_extractor() { return prefix_extractor_; }
+
+ private:
+ // important: all of these might point to invalid addresses
+ // at the time of destruction of this filter block. destructor
+ // should NOT dereference them.
+ const SliceTransform* prefix_extractor_;
+ bool whole_key_filtering_;
+ bool last_whole_key_recorded_;
+ std::string last_whole_key_str_;
+ bool last_prefix_recorded_;
+ std::string last_prefix_str_;
+
+ uint32_t num_added_;
+ std::unique_ptr<const char[]> filter_data_;
+
+};
+
+// A FilterBlockReader is used to parse the filter from an SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader + : public FilterBlockReaderCommon<ParsedFullFilterBlock> { + public: + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + bool full_length_enabled_; + size_t prefix_extractor_full_length_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/full_filter_block_test.cc b/src/rocksdb/table/block_based/full_filter_block_test.cc new file mode 100644 index 000000000..496b149ab --- /dev/null +++ b/src/rocksdb/table/block_based/full_filter_block_test.cc @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
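For orientation before the tests: the minimal lifecycle of the classes declared above is add keys, finish, then copy or hand off the result before the builder goes away, since Finish returns a Slice into memory owned by the builder. A sketch under those assumptions; BuildToyFilter is illustrative only:

#include <string>

#include "rocksdb/slice.h"
#include "table/block_based/full_filter_block.h"

namespace ROCKSDB_NAMESPACE {
// `bits` ownership transfers to the builder, per the constructor contract.
inline void BuildToyFilter(FilterBitsBuilder* bits, std::string* out) {
  FullFilterBlockBuilder builder(nullptr /* prefix_extractor */,
                                 true /* whole_key_filtering */, bits);
  builder.Add("foo");
  builder.Add("bar");
  Slice filter = builder.Finish();  // no-arg overload from the base class
  out->assign(filter.data(), filter.size());  // copy before builder dies
}
}  // namespace ROCKSDB_NAMESPACE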
+
+#include <set>
+
+#include "table/block_based/full_filter_block.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+ explicit TestFilterBitsBuilder() {}
+
+ // Add Key to filter
+ void AddKey(const Slice& key) override {
+ hash_entries_.push_back(Hash(key.data(), key.size(), 1));
+ }
+
+ // Generate the filter using the keys that are added
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4;
+ char* data = new char[len];
+ for (size_t i = 0; i < hash_entries_.size(); i++) {
+ EncodeFixed32(data + i * 4, hash_entries_[i]);
+ }
+ const char* const_data = data;
+ buf->reset(const_data);
+ return Slice(data, len);
+ }
+
+ private:
+ std::vector<uint32_t> hash_entries_;
+};
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class TestFilterBitsReader : public FilterBitsReader {
+ public:
+ explicit TestFilterBitsReader(const Slice& contents)
+ : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {}
+
+ // Silence compiler warning about overloaded virtual
+ using FilterBitsReader::MayMatch;
+ bool MayMatch(const Slice& entry) override {
+ uint32_t h = Hash(entry.data(), entry.size(), 1);
+ for (size_t i = 0; i + 4 <= len_; i += 4) {
+ if (h == DecodeFixed32(data_ + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private:
+ const char* data_;
+ uint32_t len_;
+};
+
+
+class TestHashFilter : public FilterPolicy {
+ public:
+ const char* Name() const override { return "TestHashFilter"; }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ for (int i = 0; i < n; i++) {
+ uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+ PutFixed32(dst, h);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ uint32_t h = Hash(key.data(), key.size(), 1);
+ for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+ if (h == DecodeFixed32(filter.data() + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ FilterBitsBuilder* GetFilterBitsBuilder() const override {
+ return new TestFilterBitsBuilder();
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ return new TestFilterBitsReader(contents);
+ }
+};
+
+class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ PluginFullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(new TestHashFilter) {}
+};
+
+TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
/*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ builder.Add("foo");
+ builder.Add("bar");
+ builder.Add("box");
+ builder.Add("box");
+ builder.Add("hello");
+ Slice slice = builder.Finish();
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class FullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ FullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {}
+};
+
+TEST_F(FullFilterBlockTest, EmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
+ std::unique_ptr<FilterBitsBuilder> b_;
+ std::set<std::string> uniq_;
+
+ public:
+ explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {}
+
+ ~CountUniqueFilterBitsBuilderWrapper() override {}
+
+ void AddKey(const Slice& key) override {
+ b_->AddKey(key);
+ uniq_.insert(key.ToString());
+ }
+
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ Slice rv = b_->Finish(buf);
+
uniq_.clear(); + return rv; + } + + int CalculateNumEntry(const uint32_t bytes) override { + return b_->CalculateNumEntry(bytes); + } + + size_t CountUnique() { return uniq_.size(); } +}; + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + ASSERT_EQ(0, bits_builder->CountUnique()); + // adds key and empty prefix; both abstractions count them + builder.Add("key1"); + ASSERT_EQ(2, builder.NumAdded()); + ASSERT_EQ(2, bits_builder->CountUnique()); + // Add different key (unique) and also empty prefix (not unique). + // From here in this test, it's immaterial whether the block builder + // can count unique keys. + builder.Add("key2"); + ASSERT_EQ(3, bits_builder->CountUnique()); + // Empty key -> nothing unique + builder.Add(""); + ASSERT_EQ(3, bits_builder->CountUnique()); + } + + // mix of empty and non-empty + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // 1 empty, 2 non-empty prefixes, and 4 non-empty keys + ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice = builder.Finish(); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/index_builder.cc b/src/rocksdb/table/block_based/index_builder.cc
new file mode 100644
index 000000000..277bec61d
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/index_builder.h"
+
+#include <assert.h>
+#include <cinttypes>
+
+#include <list>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+
+// Without an anonymous namespace here, we would fail the
+// -Wmissing-prototypes warning
+namespace ROCKSDB_NAMESPACE {
+// using namespace rocksdb;
+// Create an index builder based on its type.
+IndexBuilder* IndexBuilder::CreateIndexBuilder(
+ BlockBasedTableOptions::IndexType index_type,
+ const InternalKeyComparator* comparator,
+ const InternalKeySliceTransform* int_key_slice_transform,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ IndexBuilder* result = nullptr;
+ switch (index_type) {
+ case BlockBasedTableOptions::kBinarySearch: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ false);
+ } break;
+ case BlockBasedTableOptions::kHashSearch: {
+ // Currently kHashSearch is incompatible with index_block_restart_interval
+ // > 1
+ assert(table_opt.index_block_restart_interval == 1);
+ result = new HashIndexBuilder(
+ comparator, int_key_slice_transform,
+ table_opt.index_block_restart_interval, table_opt.format_version,
+ use_value_delta_encoding, table_opt.index_shortening);
+ } break;
+ case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+ result = PartitionedIndexBuilder::CreateIndexBuilder(
+ comparator, use_value_delta_encoding, table_opt);
+ } break;
+ case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ true);
+ } break;
+ default: {
+ assert(!"Do not recognize the index type ");
+ } break;
+ }
+ return result;
+}
+
+PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ return new PartitionedIndexBuilder(comparator, table_opt,
+ use_value_delta_encoding);
+}
+
+PartitionedIndexBuilder::PartitionedIndexBuilder(
+ const
InternalKeyComparator* comparator,
+ const BlockBasedTableOptions& table_opt,
+ const bool use_value_delta_encoding)
+ : IndexBuilder(comparator),
+ index_block_builder_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ index_block_builder_without_seq_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ sub_index_builder_(nullptr),
+ table_opt_(table_opt),
+ // We start with false. After each partition we revise the value based on
+ // what the sub_index_builder has decided. If the feature is disabled
+ // entirely, this will be set to true after switching the first
+ // sub_index_builder. Otherwise, it could be set to true if even one of
+ // the sub_index_builders could not safely exclude seq from the keys; it
+ // will then be enforced on all sub_index_builders on ::Finish.
+ seperator_is_key_plus_seq_(false),
+ use_value_delta_encoding_(use_value_delta_encoding) {}
+
+PartitionedIndexBuilder::~PartitionedIndexBuilder() {
+ delete sub_index_builder_;
+}
+
+void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
+ assert(sub_index_builder_ == nullptr);
+ sub_index_builder_ = new ShortenedIndexBuilder(
+ comparator_, table_opt_.index_block_restart_interval,
+ table_opt_.format_version, use_value_delta_encoding_,
+ table_opt_.index_shortening, /* include_first_key */ false);
+ flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+ // Note: this is sub-optimal since sub_index_builder_ could later reset
+ // seperator_is_key_plus_seq_ but the probability of that is low.
+ sub_index_builder_->seperator_is_key_plus_seq_
+ ? sub_index_builder_->index_block_builder_
+ : sub_index_builder_->index_block_builder_without_seq_));
+ partition_cut_requested_ = false;
+}
+
+void PartitionedIndexBuilder::RequestPartitionCut() {
+ partition_cut_requested_ = true;
+}
+
+void PartitionedIndexBuilder::AddIndexEntry(
+ std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
+ // Note: to avoid two consecutive flushes in the same method call, we do
+ // not check the flush policy when adding the last key
+ if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ if (sub_index_builder_->seperator_is_key_plus_seq_) {
+ // then we need to apply it to all sub-index builders
+ seperator_is_key_plus_seq_ = true;
+ }
+ sub_index_last_key_ = std::string(*last_key_in_current_block);
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ sub_index_builder_ = nullptr;
+ cut_filter_block = true;
+ } else {
+ // apply flush policy only to non-empty sub_index_builder_
+ if (sub_index_builder_ != nullptr) {
+ std::string handle_encoding;
+ block_handle.EncodeTo(&handle_encoding);
+ bool do_flush =
+ partition_cut_requested_ ||
+ flush_policy_->Update(*last_key_in_current_block, handle_encoding);
+ if (do_flush) {
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ cut_filter_block = true;
+ sub_index_builder_ = nullptr;
+ }
+ }
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block,
block_handle); + sub_index_last_key_ = std::string(*last_key_in_current_block); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } + } +} + +Status PartitionedIndexBuilder::Finish( + IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } + // It must be set to null after the last key is added + assert(sub_index_builder_ == nullptr); + if (finishing_indexes == true) { + Entry& last_entry = entries_.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), + handle_encoding, + &handle_delta_encoding_slice); + } + entries_.pop_front(); + } + // If there is no sub_index left, then return the 2nd level index. + if (UNLIKELY(entries_.empty())) { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; + return Status::OK(); + } else { + // Finish the next partition index in line and return Status::Incomplete() + // to indicate that we expect more calls to Finish + Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; + auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); + finishing_indexes = true; + return s.ok() ? Status::Incomplete() : s; + } +} + +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_builder.h b/src/rocksdb/table/block_based/index_builder.h new file mode 100644 index 000000000..bfffc5996 --- /dev/null +++ b/src/rocksdb/table/block_based/index_builder.h @@ -0,0 +1,443 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <assert.h> +#include <cinttypes> + +#include <list> +#include <string> +#include <unordered_map> + +#include "rocksdb/comparator.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { +// The interface for building an index. +// Instructions for adding a new concrete IndexBuilder: +// 1. Create a subclass derived from IndexBuilder. +// 2. Add a new entry associated with that subclass in +// BlockBasedTableOptions::IndexType. +// 3.
Add a create function for the new subclass in CreateIndexBuilder. +// Note: we could devise a more advanced design to simplify the process of +// adding a new subclass, but that would increase code complexity and draw +// unwanted attention from readers. Given that we won't add/change indexes +// frequently, it makes sense to just embrace a straightforward design that +// works. +class IndexBuilder { + public: + static IndexBuilder* CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + // Index builder will construct a set of blocks which contain: + // 1. One primary index block. + // 2. (Optional) a set of metablocks that contains the metadata of the + // primary index. + struct IndexBlocks { + Slice index_block_contents; + std::unordered_map<std::string, Slice> meta_blocks; + }; + explicit IndexBuilder(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. + // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. + // @last_key_in_current_block: this parameter may be overridden with the + // value "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + virtual void OnKeyAdded(const Slice& /*key*/) {} + + // Inform the index builder that all entries have been written. The index + // builder may therefore perform any operation required for block + // finalization. + // + // REQUIRES: Finish() has not yet been called. + inline Status Finish(IndexBlocks* index_blocks) { + // Throw away the changes to last_partition_block_handle. It has no effect + // on the first call to Finish anyway. + BlockHandle last_partition_block_handle; + return Finish(index_blocks, last_partition_block_handle); + } + + // This override of Finish can be utilized to build the 2nd level index in + // PartitionedIndexBuilder. + // + // index_blocks will be filled with the resulting index data. If the return + // value is Status::Incomplete(), it means that the index is partitioned and + // the caller should keep calling Finish until Status::OK() is returned. In + // that case, last_partition_block_handle refers to the block written with + // the result of the previous call to Finish. This can be utilized to build + // the second level index pointing to each block of partitioned indexes. The + // last call to Finish() that returns Status::OK() populates index_blocks with + // the 2nd level index content. + virtual Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) = 0; + + // Get the size of the index block. Must be called after ::Finish.
+ virtual size_t IndexSize() const = 0; + + virtual bool seperator_is_key_plus_seq() { return true; } + + protected: + const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; +}; + +// This index builder builds a space-efficient index block. +// +// Optimizations: +// 1. Set the block's `block_restart_interval` to 1, which avoids a linear +// search when doing index lookups (can be disabled by setting +// index_block_restart_interval). +// 2. Shorten the keys in the index block. Rather than using the last key in +// the data block verbatim as the index key, we find the shortest +// substitute key that serves the same purpose. +class ShortenedIndexBuilder : public IndexBuilder { + public: + explicit ShortenedIndexBuilder( + const InternalKeyComparator* comparator, + const int index_block_restart_interval, const uint32_t format_version, + const bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) + : IndexBuilder(comparator), + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), + shortening_mode_(shortening_mode) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } + + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + if (first_key_in_next_block != nullptr) { + if (shortening_mode_ != + BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { + comparator_->FindShortestSeparator(last_key_in_current_block, + *first_key_in_next_block); + } + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } + } else { + if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor) { + comparator_->FindShortSuccessor(last_key_in_current_block); + } + } + auto sep = Slice(*last_key_in_current_block); + + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use the delta-encoded slice.
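+ // + // Illustration (hypothetical handles): if the previous handle was + // (offset=0, size=4096) and the current one is (offset=4101, size=4096), + // the delta encoding can omit the offset entirely -- it is implied by the + // previous handle plus its size and the block trailer -- and store only + // the signed size delta (here 0), typically a single varint byte.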
+ } + last_encoded_handle_ = block_handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); + } + + current_block_first_internal_key_.clear(); + } + + using IndexBuilder::Finish; + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& /*last_partition_block_handle*/) override { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + index_size_ = index_blocks->index_block_contents.size(); + return Status::OK(); + } + + virtual size_t IndexSize() const override { return index_size_; } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + friend class PartitionedIndexBuilder; + + private: + BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; + bool seperator_is_key_plus_seq_; + const bool include_first_key_; + BlockBasedTableOptions::IndexShorteningMode shortening_mode_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; +}; + +// HashIndexBuilder contains a binary-searchable primary index and the +// metadata for secondary hash index construction. +// The metadata for the hash index consists of two parts: +// - a metablock that compactly contains a sequence of prefixes. All prefixes +// are stored consecutively without any per-prefix metadata (such as prefix +// sizes); that metadata is kept in the other metablock. +// - a metablock that contains the metadata of the prefixes, including prefix +// size, restart index, and the number of blocks it spans. The format (all +// three fields varint32-encoded) looks like: +// +// +------------------+----------------------------+----------------------+ <= prefix 1 +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ <= prefix 2 +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ +// | | +// | .... | +// | | +// +------------------+----------------------------+----------------------+ <= prefix n +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ +// +// The reason for separating these two metablocks is to enable efficient reuse +// of the first metablock during hash index construction without unnecessary +// data copies or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder { + public: + explicit HashIndexBuilder( + const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, int format_version, + bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode) + : IndexBuilder(comparator), + primary_index_builder_(comparator, index_block_restart_interval, + format_version, use_value_delta_encoding, + shortening_mode, /* include_first_key */ false), + hash_key_extractor_(hash_key_extractor) {} + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + ++current_restart_index_; + primary_index_builder_.AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + } + + virtual void OnKeyAdded(const Slice& key) override { + auto key_prefix = hash_key_extractor_->Transform(key); + bool is_first_entry = pending_block_num_ == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix_ != key_prefix) { + if (!is_first_entry) { + FlushPendingPrefix(); + } + + // Need a hard copy, otherwise the underlying data changes all the time. + // TODO(kailiu) ToString() is expensive. We may be able to speed this up + // by avoiding the data copy. + pending_entry_prefix_ = key_prefix.ToString(); + pending_block_num_ = 1; + pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); + } else { + // The block count increments when keys sharing the prefix reside in + // different data blocks. + auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; + assert(last_restart_index <= current_restart_index_); + if (last_restart_index != current_restart_index_) { + ++pending_block_num_; + } + } + } + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { + if (pending_block_num_ != 0) { + FlushPendingPrefix(); + } + primary_index_builder_.Finish(index_blocks, last_partition_block_handle); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesBlock.c_str(), prefix_block_}); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + return Status::OK(); + } + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + + prefix_meta_block_.size(); + } + + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + + private: + void FlushPendingPrefix() { + prefix_block_.append(pending_entry_prefix_.data(), + pending_entry_prefix_.size()); + PutVarint32Varint32Varint32( + &prefix_meta_block_, + static_cast<uint32_t>(pending_entry_prefix_.size()), + pending_entry_index_, pending_block_num_); + } + + ShortenedIndexBuilder primary_index_builder_; + const SliceTransform* hash_key_extractor_; + + // stores a sequence of prefixes + std::string prefix_block_; + // stores the metadata of prefixes + std::string prefix_meta_block_; + + // The following 3 variables keep the unflushed prefix and its metadata. + // The details of block_num and entry_index can be found in + // "block_hash_index.{h,cc}" + uint32_t pending_block_num_ = 0; + uint32_t pending_entry_index_ = 0; + std::string pending_entry_prefix_; + + uint64_t current_restart_index_ = 0; +}; + +/** + * IndexBuilder for two-level indexing.
Internally it creates a new index builder for + * each partition and finishes them in order as Finish is called on it + * repeatedly until Status::OK() is returned. + * + * The format on the disk would be I I I I I I IP where I is a block containing + * a partition of indexes built using ShortenedIndexBuilder and IP is a block + * containing a secondary index on the partitions, built using + * ShortenedIndexBuilder. + */ +class PartitionedIndexBuilder : public IndexBuilder { + public: + static PartitionedIndexBuilder* CreateIndexBuilder( + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); + + virtual ~PartitionedIndexBuilder(); + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; + + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } + size_t NumPartitions() const; + + inline bool ShouldCutFilterBlock() { + // Current policy is to align the partitions of index and filters + if (cut_filter_block) { + cut_filter_block = false; + return true; + } + return false; + } + + std::string& GetPartitionKey() { return sub_index_last_key_; } + + // Called when an external entity (such as filter partition builder) requests + // cutting the next partition + void RequestPartitionCut(); + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + + void MakeNewSubIndexBuilder(); + + struct Entry { + std::string key; + std::unique_ptr<ShortenedIndexBuilder> value; + }; + std::list<Entry> entries_; // list of partitioned indexes and their keys + BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys + // the active partition index builder + ShortenedIndexBuilder* sub_index_builder_; + // the last key in the active partition index builder + std::string sub_index_last_key_; + std::unique_ptr<FlushBlockPolicy> flush_policy_; + // true if Finish is called once but not complete yet. + bool finishing_indexes = false; + const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; + // true if an external entity (such as filter partition builder) requests + // cutting the next partition + bool partition_cut_requested_ = true; + // true if it should cut the next filter partition block + bool cut_filter_block = false; + BlockHandle last_encoded_handle_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/mock_block_based_table.h b/src/rocksdb/table/block_based/mock_block_based_table.h new file mode 100644 index 000000000..54817bd67 --- /dev/null +++ b/src/rocksdb/table/block_based/mock_block_based_table.h @@ -0,0 +1,56 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + + MockBlockBasedTableTester(const FilterPolicy *filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy.reset(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + context.info_log = ioptions_.info_log; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.cc b/src/rocksdb/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 000000000..3e555387e --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" +#include "rocksdb/filter_policy.h" + +namespace ROCKSDB_NAMESPACE { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.h b/src/rocksdb/table/block_based/parsed_full_filter_block.h new file mode 100644 index 000000000..36c619921 --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> + +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. 
+class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + private: + BlockContents block_contents_; + std::unique_ptr<FilterBitsReader> filter_bits_reader_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.cc b/src/rocksdb/table/block_based/partitioned_filter_block.cc new file mode 100644 index 000000000..2138d96dd --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,388 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include <utility> + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + keys_added_to_partition_(0) { + keys_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { + // Use == to send the request only once + if (keys_added_to_partition_ == keys_per_partition_) { + // Currently only the index builder is in charge of cutting a partition. + // We keep requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + filter_gc.push_back(std::unique_ptr<const char[]>(nullptr)); + + // Add the prefix of the next key before finishing the partition. This hack + // fixes a bug with format_version=3 where seeking for the prefix would lead + // us to the previous partition.
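+ // Concrete illustration (hypothetical keys, mirroring the + // PrefixInWrongPartitionBug test in partitioned_filter_block_test.cc): if + // the partition being cut ends with "p2-key2" and the next key is + // "p3-key3", a later PrefixMayMatch("p3") seeks the top-level index to + // this partition, so the prefix "p3" must also be added here, not only to + // the partition that will contain "p3-key3".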
+ const bool add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (add_prefix) { + FullFilterBlockBuilder::AddPrefix(*next_key); + } + + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, filter}); + keys_added_to_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + FullFilterBlockBuilder::AddKey(key); + keys_added_to_partition_++; +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + FilterEntry& last_entry = filters.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } + filters.pop_front(); + } else { + MaybeCutAFilterBlock(nullptr); + } + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + if (finishing_filters) { + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + return filters.front().filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, CachableEntry<Block>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<Block> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* 
lookup_context) { + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return true; + } + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry<Block>& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. 
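+ // Illustration (hypothetical keys): with partitions whose index keys are + // "d" and "p", Seek("zebra") runs past the last entry and invalidates the + // iterator, yet the prefix of "zebra" could still have been added to the + // last ("p") partition, hence the SeekToLast() below.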
+ iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<ParsedFullFilterBlock>* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since the block cache might not have had + // space for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry<Block> filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry<ParsedFullFilterBlock> filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry<Block> filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before reading the partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + + IndexBlockIter biter; + const
InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecutive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value().handle; + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { + handle = biter.value().handle; + + CachableEntry<ParsedFullFilterBlock> block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } + } + } + } +} + +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.h b/src/rocksdb/table/block_based/partitioned_filter_block.h new file mode 100644 index 000000000..314297cab --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
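The builder and reader above are engaged when partitioned filters are enabled through the public API. A minimal configuration sketch (these are real BlockBasedTableOptions fields; the values are only illustrative):

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakePartitionedFilterOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10 /* bits_per_key */, false));
  // A two-level index is a prerequisite for partitioned filters; filter
  // partitions are then cut in lockstep with index partitions (see
  // RequestPartitionCut()/ShouldCutFilterBlock() above).
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;
  table_options.metadata_block_size = 4096;  // target partition size
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}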
+ +#pragma once + +#include <list> +#include <string> +#include <unordered_map> +#include "db/dbformat.h" +#include "index_builder.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + void Add(const Slice& key) override; + + virtual Slice Finish(const BlockHandle& last_partition_block_handle, + Status* status) override; + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + Slice filter; + }; + std::list<FilterEntry> filters; // list of partitioned filters and their keys + std::unique_ptr<IndexBuilder> value; + std::vector<std::unique_ptr<const char[]>> filter_gc; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when to cut a filter block and Finish it + void MaybeCutAFilterBlock(const Slice* next_key); + // Currently we keep the same number of partitions for filters and indexes. + // This allows for some potential optimizations in the future.
If such + // optimizations do not materialize, we can use a different number of + // partitions and eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of keys per partition + uint32_t keys_per_partition_; + // The number of keys added to the last partition so far + uint32_t keys_added_to_partition_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> { + public: + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry<Block>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<ParsedFullFilterBlock>* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>> + filter_map_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block_test.cc b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc new file mode 100644 index 000000000..071bad9ca --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc @@ -0,0 +1,424 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
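For intuition about keys_per_partition_ declared above: the builder asks FilterBitsBuilder::CalculateNumEntry() how many keys fit in a partition-sized filter. A back-of-envelope sketch, under the assumption of a classic Bloom filter at 10 bits per key:

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t partition_size = 4096;  // bytes, from metadata_block_size
  const double bits_per_key = 10.0;      // NewBloomFilterPolicy(10, ...)
  // A Bloom filter of partition_size bytes holds roughly
  // partition_size * 8 / bits_per_key keys.
  const uint32_t keys_per_partition =
      static_cast<uint32_t>(partition_size * 8 / bits_per_key);
  std::cout << keys_per_partition << " keys per filter partition\n";  // ~3276
  return 0;
}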
+ +#include <map> + +#include "rocksdb/filter_policy.h" + +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/filter_policy_internal.h" + +#include "index_builder.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +std::map<uint64_t, std::string> blooms; + +class MockedBlockBasedTable : public BlockBasedTable { + public: + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { + // Initialize what Open normally does as much as necessary for the test + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); + } +}; + +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry<Block>&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : blooms) { + const uint64_t offset = pair.first; + const std::string& bloom = pair.second; + + assert(t); + assert(t->get_rep()); + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock( + t->get_rep()->table_options.filter_policy.get(), + BlockContents(Slice(bloom))), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + filter_map_[offset] = std::move(block); + } + } +}; + +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + std::shared_ptr<Cache> cache_; + int bits_per_key_; + + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator), + bits_per_key_(10) { + table_options_.filter_policy.reset( + NewBloomFilterPolicy(bits_per_key_, false)); + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; + } + + ~PartitionedFilterBlockTest() override {} + + const std::string keys[4] = {"afoo", "bar", "box", "hello"}; + const std::string missing_keys[2] = {"missing", "other"}; + + uint64_t MaxIndexSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + uint64_t max_key_size = 0; + for (int i = 1; i < num_keys; i++) { + max_key_size = std::max(max_key_size, static_cast<uint64_t>(keys[i].size())); + } + uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/); + return max_index_size; + } + + uint64_t MaxFilterSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + // General, rough over-approximation + return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); + } + + uint64_t last_offset = 10; + BlockHandle Write(const Slice& slice) { + BlockHandle bh(last_offset + 1, slice.size()); + blooms[bh.offset()] = slice.ToString(); + last_offset += bh.size(); + return bh; + } + + PartitionedIndexBuilder* NewIndexBuilder() { + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp_, !kValueDeltaEncoded, table_options_); + } + + PartitionedFilterBlockBuilder* NewBuilder( + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { + 
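// The target partition size below is metadata_block_size reduced by + // block_size_deviation percent, rounded up; e.g., with + // metadata_block_size = 4096 and block_size_deviation = 10 this yields + // (4096 * 90 + 99) / 100 = 3687 bytes. +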
assert(table_options_.block_size_deviation <= 100); + auto partition_size = static_cast<uint32_t>( + ((table_options_.metadata_block_size * + (100 - table_options_.block_size_deviation)) + + 99) / + 100); + partition_size = std::max(partition_size, static_cast<uint32_t>(1)); + const bool kValueDeltaEncoded = true; + return new PartitionedFilterBlockBuilder( + prefix_extractor, table_options_.whole_key_filtering, + BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_)), + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); + } + + PartitionedFilterBlockReader* NewReader( + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { + BlockHandle bh; + Status status; + Slice slice; + do { + slice = builder->Finish(bh, &status); + bh = Write(slice); + } while (status.IsIncomplete()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry<Block> block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); + return reader; + } + + void VerifyReader(PartitionedFilterBlockBuilder* builder, + PartitionedIndexBuilder* pib, bool empty = false, + const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder, pib)); + // Querying added keys + const bool no_io = true; + for (auto key : keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + { + // querying a key twice + auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } + // querying missing keys + for (auto key : missing_keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + if (empty) { + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } else { + // assuming a good hash function + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } + } + } + + int TestBlockPerKey() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + return CountNumOfIndexPartitions(pib.get()); + } + + void 
TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor)); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get(), /*empty=*/false, prefix_extractor); + } + + void TestBlockPerAllKeys() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + } + + void CutABlock(PartitionedIndexBuilder* builder, + const std::string& user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + builder->AddIndexEntry(&key, nullptr, dont_care_block_handle); + } + + void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, + const std::string& next_user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + std::string next_key = std::string( + *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + Slice slice = Slice(next_key.data(), next_key.size()); + builder->AddIndexEntry(&key, &slice, dont_care_block_handle); + } + + int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) { + IndexBuilder::IndexBlocks dont_care_ib; + BlockHandle dont_care_bh(10, 10); + Status s; + int cnt = 0; + do { + s = builder->Finish(&dont_care_ib, dont_care_bh); + cnt++; + } while (s.IsIncomplete()); + return cnt - 1; // subtract 1 for the 2nd level index + } +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get())); + const bool empty = true; + VerifyReader(builder.get(), pib.get(), empty); +} + +TEST_P(PartitionedFilterBlockTest, OneBlock) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerAllKeys(); + } +} + +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerTwoKeys(); + } +} + +// This reproduces the bug where the same prefix appears in multiple +// consecutive blocks but was added only to the first block's filter.
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(1)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + // Non-existent keys but with the same prefix + const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + for (auto key : pnonkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +// This reproduces the bug in format_version=3 where seeking the prefix will +// lead us to the partition before the one that has the filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(2)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3 + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist.
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix, prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerKey(); + } +} + +TEST_P(PartitionedFilterBlockTest, PartitionCount) { + int num_keys = sizeof(keys) / sizeof(*keys); + table_options_.metadata_block_size = + std::max(MaxIndexSize(), MaxFilterSize()); + int partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, 1); + // A low number ensures cutting a block after each key + table_options_.metadata_block_size = 1; + partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.cc b/src/rocksdb/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 000000000..78e2b93c1 --- /dev/null +++ b/src/rocksdb/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
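The reader defined below only comes into play for tables written with a compression dictionary. A hedged sketch of the public options that would produce such tables (these are real CompressionOptions fields; the sizes are only illustrative):

#include "rocksdb/options.h"

rocksdb::Options MakeDictionaryCompressedOptions() {
  rocksdb::Options options;
  options.compression = rocksdb::kZSTD;
  // A non-zero max_dict_bytes makes table building train and store a
  // dictionary block, which UncompressionDictReader later retrieves.
  options.compression_opts.max_dict_bytes = 16 * 1024;
  // For ZSTD, optionally sample up to this many bytes to train the dictionary.
  options.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  return options;
}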
+//
+
+#include "table/block_based/uncompression_dict_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status UncompressionDictReader::Create(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    bool use_cache, bool prefetch, bool pin,
+    BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(uncompression_dict_reader);
+
+  CachableEntry<UncompressionDict> uncompression_dict;
+  // Read the dictionary eagerly when prefetching or when the block cache is
+  // bypassed; otherwise it is loaded lazily on first use.
+  if (prefetch || !use_cache) {
+    const Status s = ReadUncompressionDictionary(
+        table, prefetch_buffer, ReadOptions(), use_cache,
+        nullptr /* get_context */, lookup_context, &uncompression_dict);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      // The dictionary now resides in the block cache; since it is not meant
+      // to be pinned, drop our reference and look it up again on demand.
+      uncompression_dict.Reset();
+    }
+  }
+
+  uncompression_dict_reader->reset(
+      new UncompressionDictReader(table, std::move(uncompression_dict)));
+
+  return Status::OK();
+}
+
+Status UncompressionDictReader::ReadUncompressionDictionary(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) {
+  // TODO: add perf counter for compression dictionary read time
+
+  assert(table);
+  assert(uncompression_dict);
+  assert(uncompression_dict->IsEmpty());
+
+  const BlockBasedTable::Rep* const rep = table->get_rep();
+  assert(rep);
+  assert(!rep->compression_dict_handle.IsNull());
+
+  const Status s = table->RetrieveBlock(
+      prefetch_buffer, read_options, rep->compression_dict_handle,
+      UncompressionDict::GetEmptyDict(), uncompression_dict,
+      BlockType::kCompressionDictionary, get_context, lookup_context,
+      /* for_compaction */ false, use_cache);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        rep->ioptions.info_log,
+        "Encountered error while reading data from compression dictionary "
+        "block %s",
+        s.ToString().c_str());
+  }
+
+  return s;
+}
+
+Status UncompressionDictReader::GetOrReadUncompressionDictionary(
+    FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) const {
+  assert(uncompression_dict);
+
+  if (!uncompression_dict_.IsEmpty()) {
+    uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+
+  return ReadUncompressionDictionary(table_, prefetch_buffer, read_options,
+                                     cache_dictionary_blocks(), get_context,
+                                     lookup_context, uncompression_dict);
+}
+
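+// Only a dictionary owned by this reader is charged here; a dictionary that
+// resides in the block cache is accounted for by the cache itself.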
+size_t UncompressionDictReader::ApproximateMemoryUsage() const {
+  assert(!uncompression_dict_.GetOwnValue() ||
+         uncompression_dict_.GetValue() != nullptr);
+  size_t usage = uncompression_dict_.GetOwnValue()
+                     ? uncompression_dict_.GetValue()->ApproximateMemoryUsage()
+                     : 0;
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+
+  return usage;
+}
+
+bool UncompressionDictReader::cache_dictionary_blocks() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  // Dictionary blocks are cached if and only if index and filter blocks are.
+  return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.h b/src/rocksdb/table/block_based/uncompression_dict_reader.h
new file mode 100644
index 000000000..3e7826179
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+#include "table/block_based/cachable_entry.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+struct BlockCacheLookupContext;
+class FilePrefetchBuffer;
+class GetContext;
+struct ReadOptions;
+struct UncompressionDict;
+
+// Provides access to the uncompression dictionary regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class UncompressionDictReader {
+ public:
+  static Status Create(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      bool use_cache, bool prefetch, bool pin,
+      BlockCacheLookupContext* lookup_context,
+      std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader);
+
+  Status GetOrReadUncompressionDictionary(
+      FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict) const;
+
+  size_t ApproximateMemoryUsage() const;
+
+ private:
+  UncompressionDictReader(
+      const BlockBasedTable* t,
+      CachableEntry<UncompressionDict>&& uncompression_dict)
+      : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
+    assert(table_);
+  }
+
+  bool cache_dictionary_blocks() const;
+
+  static Status ReadUncompressionDictionary(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict);
+
+  const BlockBasedTable* table_;
+  CachableEntry<UncompressionDict> uncompression_dict_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
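For orientation, the following is a minimal sketch, not part of the commit, of how the table open and read paths might drive this API. The helper names OpenDict and FetchDict and the particular flag values are illustrative assumptions; the signatures match the header above, and the table, prefetch_buffer, and lookup_context arguments are assumed to come from the surrounding BlockBasedTable machinery.

#include <memory>

#include "table/block_based/uncompression_dict_reader.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical open-path helper: with prefetch=true and pin=true the
// dictionary is read once up front and stays resident in the reader.
Status OpenDict(const BlockBasedTable* table,
                FilePrefetchBuffer* prefetch_buffer,
                BlockCacheLookupContext* lookup_context,
                std::unique_ptr<UncompressionDictReader>* out) {
  return UncompressionDictReader::Create(table, prefetch_buffer,
                                         /* use_cache */ true,
                                         /* prefetch */ true, /* pin */ true,
                                         lookup_context, out);
}

// Hypothetical read-path helper: no_io=false allows a file read if the
// dictionary is in neither the reader nor the block cache; passing
// no_io=true would restrict the lookup to the cache (kBlockCacheTier).
Status FetchDict(const UncompressionDictReader& reader,
                 CachableEntry<UncompressionDict>* dict) {
  return reader.GetOrReadUncompressionDictionary(
      /* prefetch_buffer */ nullptr, /* no_io */ false,
      /* get_context */ nullptr, /* lookup_context */ nullptr, dict);
}

}  // namespace ROCKSDB_NAMESPACE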