12 files changed, 3413 insertions, 0 deletions
diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc
new file mode 100644
index 000000000..21441f616
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.cc
@@ -0,0 +1,78 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/plain/plain_table_bloom.h"
+
+#include <algorithm>
+#include <string>
+
+#include "memory/allocator.h"
+#include "util/dynamic_bloom.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+  uint32_t num_blocks =
+      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_blocks an odd number to make sure more bits are involved
+  // when determining which block.
+  if (num_blocks % 2 == 0) {
+    num_blocks++;
+  }
+
+  return num_blocks * (CACHE_LINE_SIZE * 8);
+}
+}  // namespace
+
+PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes)
+    : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits,
+                                   uint32_t num_blocks) {
+  data_ = raw_data;
+  kTotalBits = total_bits;
+  kNumBlocks = num_blocks;
+}
+
+void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                                     uint32_t locality,
+                                     size_t huge_page_tlb_size,
+                                     Logger* logger) {
+  kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+                              : (total_bits + 7) / 8 * 8;
+  kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+  assert(kNumBlocks > 0 || kTotalBits > 0);
+  assert(kNumProbes > 0);
+
+  uint32_t sz = kTotalBits / 8;
+  if (kNumBlocks > 0) {
+    sz += CACHE_LINE_SIZE - 1;
+  }
+  assert(allocator);
+
+  char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+  memset(raw, 0, sz);
+  auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+  if (kNumBlocks > 0 && cache_line_offset > 0) {
+    raw += CACHE_LINE_SIZE - cache_line_offset;
+  }
+  data_ = raw;
+}
+
+void BloomBlockBuilder::AddKeysHashes(
+    const std::vector<uint32_t>& keys_hashes) {
+  for (auto hash : keys_hashes) {
+    bloom_.AddHash(hash);
+  }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h
new file mode 100644
index 000000000..460e7ec39
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.h
@@ -0,0 +1,132 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/bloom_impl.h"
+#include "util/hash.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Slice;
+class Allocator;
+class Logger;
+
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
+class PlainTableBloomV1 {
+ public:
+  // allocator: pass allocator to bloom filter, hence trace the usage of memory
+  // total_bits: fixed total bits for the bloom
+  // num_probes: number of hash probes for a single key
+  // locality:  If positive, optimize for cache line locality, 0 otherwise.
+  // hash_func:  customized hash function
+  // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
+  //                      within this page size. Need to reserve huge pages for
+  //                      it to be allocated, like:
+  //                         sysctl -w vm.nr_hugepages=20
+  //                     See linux doc Documentation/vm/hugetlbpage.txt
+  explicit PlainTableBloomV1(uint32_t num_probes = 6);
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger);
+
+  ~PlainTableBloomV1() {}
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t hash);
+
+  // Multithreaded access to this function is OK
+  bool MayContainHash(uint32_t hash) const;
+
+  void Prefetch(uint32_t hash);
+
+  uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+  Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); }
+
+  void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0);
+
+  uint32_t GetTotalBits() const { return kTotalBits; }
+
+  bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+  uint32_t kTotalBits;
+  uint32_t kNumBlocks;
+  const uint32_t kNumProbes;
+
+  char* data_;
+
+  static constexpr int LOG2_CACHE_LINE_SIZE =
+      ConstexprFloorLog2(CACHE_LINE_SIZE);
+};
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void PlainTableBloomV1::Prefetch(uint32_t h) {
+  if (kNumBlocks != 0) {
+    uint32_t ignored;
+    LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch(
+        h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE);
+  }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const {
+  assert(IsInitialized());
+  if (kNumBlocks != 0) {
+    return LegacyLocalityBloomImpl<true>::HashMayMatch(
+        h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE);
+  } else {
+    return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes,
+                                                   data_);
+  }
+}
+
+inline void PlainTableBloomV1::AddHash(uint32_t h) {
+  assert(IsInitialized());
+  if (kNumBlocks != 0) {
+    LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_,
+                                           LOG2_CACHE_LINE_SIZE);
+  } else {
+    LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_);
+  }
+}
+
+class BloomBlockBuilder {
+ public:
+  static const std::string kBloomBlock;
+
+  explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {}
+
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger) {
+    bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size,
+                        logger);
+  }
+
+  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+  void AddKeysHashes(const std::vector<uint32_t>& keys_hashes);
+
+  Slice Finish();
+
+ private:
+  PlainTableBloomV1 bloom_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc
new file mode 100644
index 000000000..04723955c
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.cc
@@ -0,0 +1,337 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_builder.h"
+
+#include <assert.h>
+
+#include <limits>
+#include <map>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// a utility that helps writing block content to the file
+//   @offset will advance if @block_contents was successfully written.
+//   @block_handle the block handle this particular block.
+IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file,
+                    uint64_t* offset, BlockHandle* block_handle) {
+  block_handle->set_offset(*offset);
+  block_handle->set_size(block_contents.size());
+  IOStatus io_s = file->Append(block_contents);
+
+  if (io_s.ok()) {
+    *offset += block_contents.size();
+  }
+  return io_s;
+}
+
+}  // namespace
+
+// kPlainTableMagicNumber was picked by running
+//    echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+PlainTableBuilder::PlainTableBuilder(
+    const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+    const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+    uint32_t column_family_id, int level_at_creation, WritableFileWriter* file,
+    uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness,
+    uint32_t bloom_bits_per_key, const std::string& column_family_name,
+    uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio,
+    bool store_index_in_file, const std::string& db_id,
+    const std::string& db_session_id, uint64_t file_number)
+    : ioptions_(ioptions),
+      moptions_(moptions),
+      bloom_block_(num_probes),
+      file_(file),
+      bloom_bits_per_key_(bloom_bits_per_key),
+      huge_page_tlb_size_(huge_page_tlb_size),
+      encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(),
+               index_sparseness),
+      store_index_in_file_(store_index_in_file),
+      prefix_extractor_(moptions.prefix_extractor.get()) {
+  // Build index block and save it in the file if hash_table_ratio > 0
+  if (store_index_in_file_) {
+    assert(hash_table_ratio > 0 || IsTotalOrderMode());
+    index_builder_.reset(new PlainTableIndexBuilder(
+        &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness,
+        hash_table_ratio, huge_page_tlb_size_));
+    properties_
+        .user_collected_properties[PlainTablePropertyNames::kBloomVersion] =
+        "1";  // For future use
+  }
+
+  properties_.fixed_key_len = user_key_len;
+
+  // for plain table, we put all the data in a big chuck.
+  properties_.num_data_blocks = 1;
+  // Fill it later if store_index_in_file_ == true
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  // To support roll-back to previous version, now still use version 0 for
+  // plain encoding.
+  properties_.format_version = (encoding_type == kPlain) ? 0 : 1;
+  properties_.column_family_id = column_family_id;
+  properties_.column_family_name = column_family_name;
+  properties_.db_id = db_id;
+  properties_.db_session_id = db_session_id;
+  properties_.db_host_id = ioptions.db_host_id;
+  if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) {
+    ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set");
+  }
+  properties_.orig_file_number = file_number;
+  properties_.prefix_extractor_name =
+      moptions_.prefix_extractor != nullptr
+          ? moptions_.prefix_extractor->AsString()
+          : "nullptr";
+
+  std::string val;
+  PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType()));
+  properties_
+      .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val;
+
+  assert(int_tbl_prop_collector_factories);
+  for (auto& factory : *int_tbl_prop_collector_factories) {
+    assert(factory);
+
+    table_properties_collectors_.emplace_back(
+        factory->CreateIntTblPropCollector(column_family_id,
+                                           level_at_creation));
+  }
+}
+
+PlainTableBuilder::~PlainTableBuilder() {
+  // They are supposed to have been passed to users through Finish()
+  // if the file succeeds.
+  status_.PermitUncheckedError();
+  io_status_.PermitUncheckedError();
+}
+
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+  // temp buffer for metadata bytes between key and value.
+  char meta_bytes_buf[6];
+  size_t meta_bytes_buf_size = 0;
+
+  ParsedInternalKey internal_key;
+  if (!ParseInternalKey(key, &internal_key, false /* log_err_key */)
+           .ok()) {  // TODO
+    assert(false);
+    return;
+  }
+  if (internal_key.type == kTypeRangeDeletion) {
+    status_ = Status::NotSupported("Range deletion unsupported");
+    return;
+  }
+
+  // Store key hash
+  if (store_index_in_file_) {
+    if (moptions_.prefix_extractor == nullptr) {
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
+    } else {
+      Slice prefix =
+          moptions_.prefix_extractor->Transform(internal_key.user_key);
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
+    }
+  }
+
+  // Write value
+  assert(offset_ <= std::numeric_limits<uint32_t>::max());
+  auto prev_offset = static_cast<uint32_t>(offset_);
+  // Write out the key
+  io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
+                                  &meta_bytes_buf_size);
+  if (SaveIndexInFile()) {
+    index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
+  }
+
+  // Write value length
+  uint32_t value_size = static_cast<uint32_t>(value.size());
+  if (io_status_.ok()) {
+    char* end_ptr =
+        EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size);
+    assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf));
+    meta_bytes_buf_size = end_ptr - meta_bytes_buf;
+    io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size));
+  }
+
+  // Write value
+  if (io_status_.ok()) {
+    io_status_ = file_->Append(value);
+    offset_ += value_size + meta_bytes_buf_size;
+  }
+
+  if (io_status_.ok()) {
+    properties_.num_entries++;
+    properties_.raw_key_size += key.size();
+    properties_.raw_value_size += value.size();
+    if (internal_key.type == kTypeDeletion ||
+        internal_key.type == kTypeSingleDeletion) {
+      properties_.num_deletions++;
+    } else if (internal_key.type == kTypeMerge) {
+      properties_.num_merge_operands++;
+    }
+  }
+
+  // notify property collectors
+  NotifyCollectTableCollectorsOnAdd(
+      key, value, offset_, table_properties_collectors_, ioptions_.logger);
+  status_ = io_status_;
+}
+
+Status PlainTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+
+  properties_.data_size = offset_;
+
+  //  Write the following blocks
+  //  1. [meta block: bloom] - optional
+  //  2. [meta block: index] - optional
+  //  3. [meta block: properties]
+  //  4. [metaindex block]
+  //  5. [footer]
+
+  MetaIndexBuilder meta_index_builer;
+
+  if (store_index_in_file_ && (properties_.num_entries > 0)) {
+    assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
+    BlockHandle bloom_block_handle;
+    if (bloom_bits_per_key_ > 0) {
+      bloom_block_.SetTotalBits(
+          &arena_,
+          static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
+          ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger);
+
+      PutVarint32(&properties_.user_collected_properties
+                       [PlainTablePropertyNames::kNumBloomBlocks],
+                  bloom_block_.GetNumBlocks());
+
+      bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
+
+      Slice bloom_finish_result = bloom_block_.Finish();
+
+      properties_.filter_size = bloom_finish_result.size();
+      io_status_ =
+          WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle);
+
+      if (!io_status_.ok()) {
+        status_ = io_status_;
+        return status_;
+      }
+      meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
+    }
+    BlockHandle index_block_handle;
+    Slice index_finish_result = index_builder_->Finish();
+
+    properties_.index_size = index_finish_result.size();
+    io_status_ =
+        WriteBlock(index_finish_result, file_, &offset_, &index_block_handle);
+
+    if (!io_status_.ok()) {
+      status_ = io_status_;
+      return status_;
+    }
+
+    meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
+                          index_block_handle);
+  }
+
+  // Calculate bloom block size and index block size
+  PropertyBlockBuilder property_block_builder;
+  // -- Add basic properties
+  property_block_builder.AddTableProperty(properties_);
+
+  property_block_builder.Add(properties_.user_collected_properties);
+
+  // -- Add user collected properties
+  NotifyCollectTableCollectorsOnFinish(
+      table_properties_collectors_, ioptions_.logger, &property_block_builder);
+
+  // -- Write property block
+  BlockHandle property_block_handle;
+  IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_,
+                          &property_block_handle);
+  if (!s.ok()) {
+    return static_cast<Status>(s);
+  }
+  meta_index_builer.Add(kPropertiesBlockName, property_block_handle);
+
+  // -- write metaindex block
+  BlockHandle metaindex_block_handle;
+  io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_,
+                          &metaindex_block_handle);
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  // Write Footer
+  // no need to write out new footer if we're using default checksum
+  FooterBuilder footer;
+  footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_,
+               kNoChecksum, metaindex_block_handle);
+  io_status_ = file_->Append(footer.GetSlice());
+  if (io_status_.ok()) {
+    offset_ += footer.GetSlice().size();
+  }
+  status_ = io_status_;
+  return status_;
+}
+
+void PlainTableBuilder::Abandon() { closed_ = true; }
+
+uint64_t PlainTableBuilder::NumEntries() const {
+  return properties_.num_entries;
+}
+
+uint64_t PlainTableBuilder::FileSize() const { return offset_; }
+
+std::string PlainTableBuilder::GetFileChecksum() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* PlainTableBuilder::GetFileChecksumFuncName() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksumFuncName();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string,
+                                                    uint64_t uint_64) {
+  // TODO: storing seqno to time mapping is not yet support for plain table.
+  TableBuilder::SetSeqnoTimeTableProperties(string, uint_64);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h
new file mode 100644
index 000000000..445491c2a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.h
@@ -0,0 +1,154 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_index.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+// The builder class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  PlainTableBuilder(
+      const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+      const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+      uint32_t column_family_id, int level_at_creation,
+      WritableFileWriter* file, uint32_t user_key_size,
+      EncodingType encoding_type, size_t index_sparseness,
+      uint32_t bloom_bits_per_key, const std::string& column_family_name,
+      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+      double hash_table_ratio = 0, bool store_index_in_file = false,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      uint64_t file_number = 0);
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return io_status_; }
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  TableProperties GetTableProperties() const override { return properties_; }
+
+  bool SaveIndexInFile() const { return store_index_in_file_; }
+
+  // Get file checksum
+  std::string GetFileChecksum() const override;
+
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override;
+
+  void SetSeqnoTimeTableProperties(const std::string& string,
+                                   uint64_t uint_64) override;
+
+ private:
+  Arena arena_;
+  const ImmutableOptions& ioptions_;
+  const MutableCFOptions& moptions_;
+  std::vector<std::unique_ptr<IntTblPropCollector>>
+      table_properties_collectors_;
+
+  BloomBlockBuilder bloom_block_;
+  std::unique_ptr<PlainTableIndexBuilder> index_builder_;
+
+  WritableFileWriter* file_;
+  uint64_t offset_ = 0;
+  uint32_t bloom_bits_per_key_;
+  size_t huge_page_tlb_size_;
+  Status status_;
+  IOStatus io_status_;
+  TableProperties properties_;
+  PlainTableKeyEncoder encoder_;
+
+  bool store_index_in_file_;
+
+  std::vector<uint32_t> keys_or_prefixes_hashes_;
+  bool closed_ = false;  // Either Finish() or Abandon() has been called.
+
+  const SliceTransform* prefix_extractor_;
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(ExtractUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc
new file mode 100644
index 000000000..dfe5241a5
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.cc
@@ -0,0 +1,350 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain/plain_table_factory.h"
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/plain/plain_table_builder.h"
+#include "table/plain/plain_table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
+    {"user_key_len",
+     {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"bloom_bits_per_key",
+     {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"hash_table_ratio",
+     {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"index_sparseness",
+     {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"huge_page_tlb_size",
+     {offsetof(struct PlainTableOptions, huge_page_tlb_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"encoding_type",
+     {offsetof(struct PlainTableOptions, encoding_type),
+      OptionType::kEncodingType, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"full_scan_mode",
+     {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"store_index_in_file",
+     {offsetof(struct PlainTableOptions, store_index_in_file),
+      OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+PlainTableFactory::PlainTableFactory(const PlainTableOptions& options)
+    : table_options_(options) {
+  RegisterOptions(&table_options_, &plain_table_type_info);
+}
+
+Status PlainTableFactory::NewTableReader(
+    const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool /*prefetch_index_and_filter_in_cache*/) const {
+  return PlainTableReader::Open(
+      table_reader_options.ioptions, table_reader_options.env_options,
+      table_reader_options.internal_comparator, std::move(file), file_size,
+      table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio,
+      table_options_.index_sparseness, table_options_.huge_page_tlb_size,
+      table_options_.full_scan_mode, table_reader_options.immortal,
+      table_reader_options.prefix_extractor.get());
+}
+
+TableBuilder* PlainTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  // Ignore the skip_filters flag. PlainTable format is optimized for small
+  // in-memory dbs. The skip_filters optimization is not useful for plain
+  // tables
+  //
+  return new PlainTableBuilder(
+      table_builder_options.ioptions, table_builder_options.moptions,
+      table_builder_options.int_tbl_prop_collector_factories,
+      table_builder_options.column_family_id,
+      table_builder_options.level_at_creation, file,
+      table_options_.user_key_len, table_options_.encoding_type,
+      table_options_.index_sparseness, table_options_.bloom_bits_per_key,
+      table_builder_options.column_family_name, 6,
+      table_options_.huge_page_tlb_size, table_options_.hash_table_ratio,
+      table_options_.store_index_in_file, table_builder_options.db_id,
+      table_builder_options.db_session_id, table_builder_options.cur_file_num);
+}
+
+std::string PlainTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  user_key_len: %u\n",
+           table_options_.user_key_len);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  bloom_bits_per_key: %d\n",
+           table_options_.bloom_bits_per_key);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
+           table_options_.hash_table_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_sparseness: %" ROCKSDB_PRIszt "\n",
+           table_options_.index_sparseness);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.huge_page_tlb_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  encoding_type: %d\n",
+           table_options_.encoding_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  full_scan_mode: %d\n",
+           table_options_.full_scan_mode);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  store_index_in_file: %d\n",
+           table_options_.store_index_in_file);
+  ret.append(buffer);
+  return ret;
+}
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  config_options.invoke_prepare_options = false;
+  return GetPlainTableOptionsFromString(config_options, table_options, opts_str,
+                                        new_table_options);
+}
+
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+                                      const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+                                  new_table_options);
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
+                                             const std::string& /*arg*/) {
+  // The MemTableRepFactory built-in classes will be either a class
+  // (VectorRepFactory) or a nickname (vector), followed optionally by ":#",
+  // where # is the "size" of the factory.
+  auto AsPattern = [](const std::string& name, const std::string& alt) {
+    auto pattern = ObjectLibrary::PatternEntry(name, true);
+    pattern.AnotherName(alt);
+    pattern.AddNumber(":");
+    return pattern;
+  };
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(new VectorRepFactory(count));
+        } else {
+          guard->reset(new VectorRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t lookahead = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(new SkipListFactory(lookahead));
+        } else {
+          guard->reset(new SkipListFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern("HashLinkListRepFactory", "hash_linkedlist"),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        // Expecting format: hash_linkedlist:<hash_bucket_count>
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(NewHashLinkListRepFactory(hash_bucket_count));
+        } else {
+          guard->reset(NewHashLinkListRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern("HashSkipListRepFactory", "prefix_hash"),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        // Expecting format: prefix_hash:<hash_bucket_count>
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(NewHashSkipListRepFactory(hash_bucket_count));
+        } else {
+          guard->reset(NewHashSkipListRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      "cuckoo",
+      [](const std::string& /*uri*/,
+         std::unique_ptr<MemTableRepFactory>* /*guard*/, std::string* errmsg) {
+        *errmsg = "cuckoo hash memtable is not supported anymore.";
+        return nullptr;
+      });
+
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status GetMemTableRepFactoryFromString(
+    const std::string& opts_str, std::unique_ptr<MemTableRepFactory>* result) {
+  ConfigOptions config_options;
+  config_options.ignore_unsupported_options = false;
+  config_options.ignore_unknown_options = false;
+  return MemTableRepFactory::CreateFromString(config_options, opts_str, result);
+}
+
+Status MemTableRepFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::unique_ptr<MemTableRepFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (value.empty()) {
+    // No Id and no options.  Clear the object
+    result->reset();
+    return Status::OK();
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
+  } else {
+#ifndef ROCKSDB_LITE
+    status = NewUniqueObject<MemTableRepFactory>(config_options, id, opt_map,
+                                                 result);
+#else
+    // To make it possible to configure the memtables in LITE mode, the ID
+    // is of the form <name>:<size>, where name is the name of the class and
+    // <size> is the length of the object (e.g. skip_list:10).
+    std::vector<std::string> opts_list = StringSplit(id, ':');
+    if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) {
+      status = Status::InvalidArgument("Can't parse memtable_factory option ",
+                                       value);
+    } else if (opts_list[0] == SkipListFactory::kNickName() ||
+               opts_list[0] == SkipListFactory::kClassName()) {
+      // Expecting format
+      // skip_list:<lookahead>
+      if (opts_list.size() == 2) {
+        size_t lookahead = ParseSizeT(opts_list[1]);
+        result->reset(new SkipListFactory(lookahead));
+      } else {
+        result->reset(new SkipListFactory());
+      }
+    } else if (!config_options.ignore_unsupported_options) {
+      status = Status::NotSupported("Cannot load object in LITE mode ", id);
+    }
+#endif  // ROCKSDB_LITE
+  }
+  return status;
+}
+
+Status MemTableRepFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<MemTableRepFactory>* result) {
+  std::unique_ptr<MemTableRepFactory> factory;
+  Status s = CreateFromString(config_options, value, &factory);
+  if (factory && s.ok()) {
+    result->reset(factory.release());
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status GetPlainTableOptionsFromMap(
+    const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = input_strings_escaped;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  return GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+                                     new_table_options);
+}
+
+Status GetPlainTableOptionsFromMap(
+    const ConfigOptions& config_options, const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options) {
+  assert(new_table_options);
+  PlainTableFactory ptf(table_options);
+  Status s = ptf.ConfigureFromMap(config_options, opts_map);
+  if (s.ok()) {
+    *new_table_options = *(ptf.GetOptions<PlainTableOptions>());
+  } else {
+    // Restore "new_options" to the default "base_options".
+    *new_table_options = table_options;
+  }
+  return s;
+}
+
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
+  return new PlainTableFactory(options);
+}
+
+const std::string PlainTablePropertyNames::kEncodingType =
+    "rocksdb.plain.table.encoding.type";
+
+const std::string PlainTablePropertyNames::kBloomVersion =
+    "rocksdb.plain.table.bloom.version";
+
+const std::string PlainTablePropertyNames::kNumBloomBlocks =
+    "rocksdb.plain.table.bloom.numblocks";
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h
new file mode 100644
index 000000000..ce60b9d19
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.h
@@ -0,0 +1,182 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// PlainTableFactory is the entrance function to the PlainTable format of
+// SST files. It returns instances PlainTableBuilder as the builder
+// class and PlainTableReader as the reader class, where the format is
+// actually implemented.
+//
+// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
+// Data is not organized in blocks, which allows fast access. Because of
+// following downsides
+// 1. Data compression is not supported.
+// 2. Data is not checksumed.
+// it is not recommended to use this format on other type of file systems.
+//
+// PlainTable requires fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version     | user_key_length |
+// +------------++------------+-----------------+  <= key1 offset
+// |  encoded key1            | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value1                                     |
+// |                                            |
+// +--------------------------+-------------+---+  <= key2 offset
+// | encoded key2             | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+
+//
+// When the key encoding type is kPlain. Key part is encoded as:
+// +------------+--------------------+
+// | [key_size] |  internal key      |
+// +------------+--------------------+
+// for the case of user_key_len = kPlainTableVariableLength case,
+// and simply:
+// +----------------------+
+// |  internal key        |
+// +----------------------+
+// for user_key_len != kPlainTableVariableLength case.
+//
+// If key encoding type is kPrefix. Keys are encoding in this format.
+// There are three ways to encode a key:
+// (1) Full Key
+// +---------------+---------------+-------------------+
+// | Full Key Flag | Full Key Size | Full Internal Key |
+// +---------------+---------------+-------------------+
+// which simply encodes a full key
+//
+// (2) A key shared the same prefix as the previous key, which is encoded as
+//     format of (1).
+// +-------------+-------------+-------------+-------------+------------+
+// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
+// +-------------+-------------+-------------+-------------+------------+
+// where key is the suffix part of the key, including the internal bytes.
+// the actual key will be constructed by concatenating prefix part of the
+// previous key, with the suffix part of the key here, with sizes given here.
+//
+// (3) A key shared the same prefix as the previous key, which is encoded as
+//     the format of (2).
+// +-----------------+-----------------+------------------------+
+// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
+// +-----------------+-----------------+------------------------+
+// The key will be constructed by concatenating previous key's prefix (which is
+// also a prefix which the last key encoded in the format of (1)) and the
+// key given here.
+//
+// For example, we for following keys (prefix and suffix are separated by
+// spaces):
+//   0000 0001
+//   0000 00021
+//   0000 0002
+//   00011 00
+//   0002 0001
+// Will be encoded like this:
+//   FK 8 00000001
+//   PF 4 SF 5 00021
+//   SF 4 0002
+//   FK 7 0001100
+//   FK 8 00020001
+// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
+//
+// All those "key flag + key size" shown above are in this format:
+// The 8 bits of the first byte:
+// +----+----+----+----+----+----+----+----+
+// |  Type   |            Size             |
+// +----+----+----+----+----+----+----+----+
+// Type indicates: full key, prefix, or suffix.
+// The last 6 bits are for size. If the size bits are not all 1, it means the
+// size of the key. Otherwise, varint32 is read after this byte. This varint
+// value + 0x3F (the value of all 1) will be the key size.
+//
+// For example, full key with length 16 will be encoded as (binary):
+//     00 010000
+// (00 means full key)
+// and a prefix with 100 bytes will be encoded as:
+//     01 111111    00100101
+//         (63)       (37)
+// (01 means key suffix)
+//
+// All the internal keys above (including kPlain and kPrefix) are encoded in
+// this format:
+// There are two types:
+// (1) normal internal key format
+// +----------- ...... -------------+----+---+---+---+---+---+---+---+
+// |       user key                 |type|      sequence ID          |
+// +----------- ..... --------------+----+---+---+---+---+---+---+---+
+// (2) Special case for keys whose sequence ID is 0 and is value type
+// +----------- ...... -------------+----+
+// |       user key                 |0x80|
+// +----------- ..... --------------+----+
+// To save 7 bytes for the special case where sequence ID = 0.
+//
+//
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_len is the length of the user key. If it is set to be
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // the keys need to have the fix length of this value. bloom_bits_per_key is
+  // number of bits used for bloom filer per key. hash_table_ratio is
+  // the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skip hash table but only replying on binary
+  // search.
+  // index_sparseness determines index interval for keys
+  // inside the same prefix. It will be the maximum number of linear search
+  // required after hash and binary search.
+  // index_sparseness = 0 means index for every key.
+  // huge_page_tlb_size determines whether to allocate hash indexes from huge
+  // page TLB and the page size if allocating from there. See comments of
+  // Arena::AllocateAligned() for details.
+  explicit PlainTableFactory(
+      const PlainTableOptions& _table_options = PlainTableOptions());
+
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() { return kPlainTableName(); }
+  const char* Name() const override { return kPlainTableName(); }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(const ReadOptions& ro,
+                        const TableReaderOptions& table_reader_options,
+                        std::unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size, std::unique_ptr<TableReader>* table,
+                        bool prefetch_index_and_filter_in_cache) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+  static const char kValueTypeSeqId0 = char(~0);
+
+ private:
+  PlainTableOptions table_options_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc
new file mode 100644
index 000000000..b7e07cfb2
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.cc
@@ -0,0 +1,213 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_index.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  assert(num_buckets > 0);
+  return hash % num_buckets;
+}
+}  // namespace
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+  if (!GetVarint32(&data, &index_size_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  assert(index_size_ > 0);
+  if (!GetVarint32(&data, &num_prefixes_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  sub_index_size_ =
+      static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+  char* index_data_begin = const_cast<char*>(data.data());
+  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+  return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+    uint32_t prefix_hash, uint32_t* bucket_value) const {
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  GetUnaligned(index_ + bucket, bucket_value);
+  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+    *bucket_value ^= kSubIndexMask;
+    return kSubindex;
+  }
+  if (*bucket_value >= kMaxFileSize) {
+    return kNoPrefixForBucket;
+  } else {
+    // point directly to the file
+    return kDirectToFile;
+  }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+                                                        uint32_t offset) {
+  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+    current_group_ = AllocateNewGroup();
+    num_records_in_current_group_ = 0;
+  }
+  auto& new_record = current_group_[num_records_in_current_group_++];
+  new_record.hash = hash;
+  new_record.offset = offset;
+  new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+                                          uint32_t key_offset) {
+  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+    ++num_prefixes_;
+    if (!is_first_record_) {
+      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+    }
+    num_keys_per_prefix_ = 0;
+    prev_key_prefix_ = key_prefix_slice.ToString();
+    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+    due_index_ = true;
+  }
+
+  if (due_index_) {
+    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+    due_index_ = false;
+  }
+
+  num_keys_per_prefix_++;
+  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+    due_index_ = true;
+  }
+  is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+  AllocateIndex();
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+  ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s",
+                 keys_per_prefix_hist_.ToString().c_str());
+
+  // From the temp data structure, populate indexes.
+  return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+    index_size_ =
+        static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+    assert(index_size_ > 0);
+  }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+    std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list_.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list_.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+
+  sub_index_size_ = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have subindex.
+    sub_index_size_ += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+  }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
+                  sub_index_size_);
+  auto total_allocate_size = GetTotalSize();
+  char* allocated = arena_->AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, ioptions_.logger);
+
+  auto temp_ptr = EncodeVarint32(allocated, index_size_);
+  uint32_t* index =
+      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+  char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+  uint32_t sub_index_offset = 0;
+  for (uint32_t i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+        break;
+      case 1:
+        // point directly to the file offset
+        PutUnaligned(index + i, hash_to_offsets[i]->offset);
+        break;
+      default:
+        // point to second level indexes.
+        PutUnaligned(index + i,
+                     sub_index_offset | PlainTableIndex::kSubIndexMask);
+        char* prev_ptr = &sub_index[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= sub_index_size_);
+        break;
+    }
+  }
+  assert(sub_index_offset == sub_index_size_);
+
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+                  index_size_, sub_index_size_);
+  return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+    "PlainTableIndexBlock";
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h
new file mode 100644
index 000000000..9f5f0eeff
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.h
@@ -0,0 +1,248 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains two classes PlainTableIndex and PlainTableIndexBuilder
+// The two classes implement the index format of PlainTable.
+// For description of PlainTable format, see comments of class
+// PlainTableFactory
+//
+//
+// PlainTableIndex contains buckets size of index_size_, each is a
+// 32-bit integer. The lower 31 bits contain an offset value (explained below)
+// and the first bit of the integer indicates type of the offset.
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+//   hashing this prefix), whose first row starts from this offset of the
+// file.
+// 1 indicates that the bucket contains more than one prefixes, or there
+//   are too many rows for one prefix so we need a binary search for it. In
+//   this case, the offset indicates the offset of sub_index_ holding the
+//   binary search indexes of keys for those rows. Those binary search indexes
+//   are organized in this way:
+//
+// The first 4 bytes, indicate how many indexes (N) are stored after it. After
+// it, there are N 32-bit integers, each points of an offset of the file,
+// which
+// points to starting of a row. Those offsets need to be guaranteed to be in
+// ascending order so the keys they are pointing to are also in ascending
+// order
+// to make sure we can use them to do binary searches. Below is visual
+// presentation of a bucket.
+//
+// <begin>
+//   number_of_records:  varint32
+//   record 1 file offset:  fixedint32
+//   record 2 file offset:  fixedint32
+//    ....
+//   record N file offset:  fixedint32
+// <end>
+
+// The class loads the index block from a PlainTable SST file, and executes
+// the index lookup.
+// The class is used by PlainTableReader class.
+class PlainTableIndex {
+ public:
+  enum IndexSearchResult {
+    kNoPrefixForBucket = 0,
+    kDirectToFile = 1,
+    kSubindex = 2
+  };
+
+  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+  PlainTableIndex()
+      : index_size_(0),
+        sub_index_size_(0),
+        num_prefixes_(0),
+        index_(nullptr),
+        sub_index_(nullptr) {}
+
+  // The function that executes the lookup the hash table.
+  // The hash key is `prefix_hash`. The function fills the hash bucket
+  // content in `bucket_value`, which is up to the caller to interpret.
+  IndexSearchResult GetOffset(uint32_t prefix_hash,
+                              uint32_t* bucket_value) const;
+
+  // Initialize data from `index_data`, which points to raw data for
+  // index stored in the SST file.
+  Status InitFromRawData(Slice index_data);
+
+  // Decode the sub index for specific hash bucket.
+  // The `offset` is the value returned as `bucket_value` by GetOffset()
+  // and is only valid when the return value is `kSubindex`.
+  // The return value is the pointer to the starting address of the
+  // sub-index. `upper_bound` is filled with the value indicating how many
+  // entries the sub-index has.
+  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+                                              uint32_t* upper_bound) const {
+    const char* index_ptr = &sub_index_[offset];
+    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+  }
+
+  uint32_t GetIndexSize() const { return index_size_; }
+
+  uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+  uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+  static const uint64_t kMaxFileSize = (1u << 31) - 1;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+  uint32_t num_prefixes_;
+
+  uint32_t* index_;
+  char* sub_index_;
+};
+
+// PlainTableIndexBuilder is used to create plain table index.
+// After calling Finish(), it returns Slice, which is usually
+// used either to initialize PlainTableIndex or
+// to save index to sst file.
+// For more details about the index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+// The class is used by PlainTableBuilder class.
+class PlainTableIndexBuilder {
+ public:
+  PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
+                         const SliceTransform* prefix_extractor,
+                         size_t index_sparseness, double hash_table_ratio,
+                         size_t huge_page_tlb_size)
+      : arena_(arena),
+        ioptions_(ioptions),
+        record_list_(kRecordsPerGroup),
+        is_first_record_(true),
+        due_index_(false),
+        num_prefixes_(0),
+        num_keys_per_prefix_(0),
+        prev_key_prefix_hash_(0),
+        index_sparseness_(index_sparseness),
+        index_size_(0),
+        sub_index_size_(0),
+        prefix_extractor_(prefix_extractor),
+        hash_table_ratio_(hash_table_ratio),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
+
+  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+  Slice Finish();
+
+  uint32_t GetTotalSize() const {
+    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+  }
+
+  static const std::string kPlainTableIndexBlock;
+
+ private:
+  struct IndexRecord {
+    uint32_t hash;    // hash of the prefix
+    uint32_t offset;  // offset of a row
+    IndexRecord* next;
+  };
+
+  // Helper class to track all the index records
+  class IndexRecordList {
+   public:
+    explicit IndexRecordList(size_t num_records_per_group)
+        : kNumRecordsPerGroup(num_records_per_group),
+          current_group_(nullptr),
+          num_records_in_current_group_(num_records_per_group) {}
+
+    ~IndexRecordList() {
+      for (size_t i = 0; i < groups_.size(); i++) {
+        delete[] groups_[i];
+      }
+    }
+
+    void AddRecord(uint32_t hash, uint32_t offset);
+
+    size_t GetNumRecords() const {
+      return (groups_.size() - 1) * kNumRecordsPerGroup +
+             num_records_in_current_group_;
+    }
+    IndexRecord* At(size_t index) {
+      return &(
+          groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+    }
+
+   private:
+    IndexRecord* AllocateNewGroup() {
+      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+      groups_.push_back(result);
+      return result;
+    }
+
+    // Each group in `groups_` contains fix-sized records (determined by
+    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
+    // occurs.
+    const size_t kNumRecordsPerGroup;
+    IndexRecord* current_group_;
+    // List of arrays allocated
+    std::vector<IndexRecord*> groups_;
+    size_t num_records_in_current_group_;
+  };
+
+  void AllocateIndex();
+
+  // Internal helper function to bucket index record list to hash buckets.
+  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+                        std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper class to fill the indexes and bloom filters to internal
+  // data structures.
+  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+                    const std::vector<uint32_t>& entries_per_bucket);
+
+  Arena* arena_;
+  const ImmutableOptions ioptions_;
+  HistogramImpl keys_per_prefix_hist_;
+  IndexRecordList record_list_;
+  bool is_first_record_;
+  bool due_index_;
+  uint32_t num_prefixes_;
+  uint32_t num_keys_per_prefix_;
+
+  uint32_t prev_key_prefix_hash_;
+  size_t index_sparseness_;
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+
+  const SliceTransform* prefix_extractor_;
+  double hash_table_ratio_;
+  size_t huge_page_tlb_size_;
+
+  std::string prev_key_prefix_;
+
+  static const size_t kRecordsPerGroup = 256;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc
new file mode 100644
index 000000000..800d8d76f
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.cc
@@ -0,0 +1,509 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_key_coding.h"
+
+#include <algorithm>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum PlainTableEntryType : unsigned char {
+  kFullKey = 0,
+  kPrefixFromPreviousKey = 1,
+  kKeySuffix = 2,
+};
+
+namespace {
+
+// Control byte:
+// First two bits indicate type of entry
+// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes
+// are used. key_size-0x3F will be encoded as a variint32 after this bytes.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+                  char* out_buffer) {
+  out_buffer[0] = type << 6;
+
+  if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+    // size inlined
+    out_buffer[0] |= static_cast<char>(key_size);
+    return 1;
+  } else {
+    out_buffer[0] |= kSizeInlineLimit;
+    char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+    return ptr - out_buffer;
+  }
+}
+}  // namespace
+
+// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+                                               PlainTableEntryType* entry_type,
+                                               uint32_t* key_size,
+                                               uint32_t* bytes_read) {
+  Slice next_byte_slice;
+  bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  *entry_type = static_cast<PlainTableEntryType>(
+      (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+      6);
+  char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
+  if (inline_key_size < kSizeInlineLimit) {
+    *key_size = inline_key_size;
+    *bytes_read = 1;
+    return Status::OK();
+  } else {
+    uint32_t extra_size;
+    uint32_t tmp_bytes_read;
+    success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+                                        &tmp_bytes_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_bytes_read > 0);
+    *key_size = kSizeInlineLimit + extra_size;
+    *bytes_read = tmp_bytes_read + 1;
+    return Status::OK();
+  }
+}
+
+IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key,
+                                         WritableFileWriter* file,
+                                         uint64_t* offset, char* meta_bytes_buf,
+                                         size_t* meta_bytes_buf_size) {
+  ParsedInternalKey parsed_key;
+  Status pik_status =
+      ParseInternalKey(key, &parsed_key, false /* log_err_key */);  // TODO
+  if (!pik_status.ok()) {
+    return IOStatus::Corruption(pik_status.getState());
+  }
+
+  Slice key_to_write = key;  // Portion of internal key to write out.
+
+  uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+  if (encoding_type_ == kPlain) {
+    if (fixed_user_key_len_ == kPlainTableVariableLength) {
+      // Write key length
+      char key_size_buf[5];  // tmp buffer for key size as varint32
+      char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+      assert(ptr <= key_size_buf + sizeof(key_size_buf));
+      auto len = ptr - key_size_buf;
+      IOStatus io_s = file->Append(Slice(key_size_buf, len));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += len;
+    }
+  } else {
+    assert(encoding_type_ == kPrefix);
+    char size_bytes[12];
+    size_t size_bytes_pos = 0;
+
+    Slice prefix =
+        prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+    if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() ||
+        key_count_for_prefix_ % index_sparseness_ == 0) {
+      key_count_for_prefix_ = 1;
+      pre_prefix_.SetUserKey(prefix);
+      size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+    } else {
+      key_count_for_prefix_++;
+      if (key_count_for_prefix_ == 2) {
+        // For second key within a prefix, need to encode prefix length
+        size_bytes_pos +=
+            EncodeSize(kPrefixFromPreviousKey,
+                       static_cast<uint32_t>(pre_prefix_.GetUserKey().size()),
+                       size_bytes + size_bytes_pos);
+      }
+      uint32_t prefix_len =
+          static_cast<uint32_t>(pre_prefix_.GetUserKey().size());
+      size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+                                   size_bytes + size_bytes_pos);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+      key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+    }
+  }
+
+  // Encode full key
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to safe one file append call, which takes 1 byte.
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    IOStatus io_s =
+        file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size() - 8;
+    meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+    *meta_bytes_buf_size += 1;
+  } else {
+    IOStatus io_s = file->Append(key_to_write);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size();
+  }
+
+  return IOStatus::OK();
+}
+
+Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
+                                          uint32_t len) {
+  assert(file_offset + len <= file_info_->data_end_offset);
+  return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
+               len);
+}
+
+bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
+                                       Slice* out) {
+  const uint32_t kPrefetchSize = 256u;
+
+  // Try to read from buffers.
+  for (uint32_t i = 0; i < num_buf_; i++) {
+    Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
+    if (file_offset >= buffer->buf_start_offset &&
+        file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
+      *out = GetFromBuffer(buffer, file_offset, len);
+      return true;
+    }
+  }
+
+  Buffer* new_buffer;
+  // Data needed is not in any of the buffer. Allocate a new buffer.
+  if (num_buf_ < buffers_.size()) {
+    // Add a new buffer
+    new_buffer = new Buffer();
+    buffers_[num_buf_++].reset(new_buffer);
+  } else {
+    // Now simply replace the last buffer. Can improve the placement policy
+    // if needed.
+    new_buffer = buffers_[num_buf_ - 1].get();
+  }
+
+  assert(file_offset + len <= file_info_->data_end_offset);
+  uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+                                   std::max(kPrefetchSize, len));
+  if (size_to_read > new_buffer->buf_capacity) {
+    new_buffer->buf.reset(new char[size_to_read]);
+    new_buffer->buf_capacity = size_to_read;
+    new_buffer->buf_len = 0;
+  }
+  Slice read_result;
+  // TODO: rate limit plain table reads.
+  Status s =
+      file_info_->file->Read(IOOptions(), file_offset, size_to_read,
+                             &read_result, new_buffer->buf.get(), nullptr,
+                             Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    status_ = s;
+    return false;
+  }
+  new_buffer->buf_start_offset = file_offset;
+  new_buffer->buf_len = size_to_read;
+  *out = GetFromBuffer(new_buffer, file_offset, len);
+  return true;
+}
+
+inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  if (file_info_->is_mmap_mode) {
+    const char* start = file_info_->file_data.data() + offset;
+    const char* limit =
+        file_info_->file_data.data() + file_info_->data_end_offset;
+    const char* key_ptr = GetVarint32Ptr(start, limit, out);
+    assert(key_ptr != nullptr);
+    *bytes_read = static_cast<uint32_t>(key_ptr - start);
+    return true;
+  } else {
+    return ReadVarint32NonMmap(offset, out, bytes_read);
+  }
+}
+
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  const char* start;
+  const char* limit;
+  const uint32_t kMaxVarInt32Size = 6u;
+  uint32_t bytes_to_read =
+      std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+  Slice bytes;
+  if (!Read(offset, bytes_to_read, &bytes)) {
+    return false;
+  }
+  start = bytes.data();
+  limit = bytes.data() + bytes.size();
+
+  const char* key_ptr = GetVarint32Ptr(start, limit, out);
+  *bytes_read =
+      (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+  return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+    uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+    uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+  Slice tmp_slice;
+  bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
+    parsed_key->sequence = 0;
+    parsed_key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+    *internal_key_valid = false;
+  } else {
+    success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *internal_key_valid = true;
+    Status pik_status = ParseInternalKey(*internal_key, parsed_key,
+                                         false /* log_err_key */);  // TODO
+    if (!pik_status.ok()) {
+      return Status::Corruption(
+          Slice("Corrupted key found during next key read. "),
+          pik_status.getState());
+    }
+    *bytes_read += user_key_size + 8;
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+                                                  ParsedInternalKey* parsed_key,
+                                                  Slice* internal_key,
+                                                  uint32_t* bytes_read,
+                                                  bool* /*seekable*/) {
+  uint32_t user_key_size = 0;
+  Status s;
+  if (fixed_user_key_len_ != kPlainTableVariableLength) {
+    user_key_size = fixed_user_key_len_;
+  } else {
+    uint32_t tmp_size = 0;
+    uint32_t tmp_read;
+    bool success =
+        file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_read > 0);
+    user_key_size = tmp_size;
+    *bytes_read = tmp_read;
+  }
+  // dummy initial value to avoid compiler complain
+  bool decoded_internal_key_valid = true;
+  Slice decoded_internal_key;
+  s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+                      bytes_read, &decoded_internal_key_valid,
+                      &decoded_internal_key);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!file_reader_.file_info()->is_mmap_mode) {
+    cur_key_.SetInternalKey(*parsed_key);
+    parsed_key->user_key =
+        Slice(cur_key_.GetInternalKey().data(), user_key_size);
+    if (internal_key != nullptr) {
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  } else if (internal_key != nullptr) {
+    if (decoded_internal_key_valid) {
+      *internal_key = decoded_internal_key;
+    } else {
+      // Need to copy out the internal key
+      cur_key_.SetInternalKey(*parsed_key);
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+    uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+    uint32_t* bytes_read, bool* seekable) {
+  PlainTableEntryType entry_type;
+
+  bool expect_suffix = false;
+  Status s;
+  do {
+    uint32_t size = 0;
+    // dummy initial value to avoid compiler complain
+    bool decoded_internal_key_valid = true;
+    uint32_t my_bytes_read = 0;
+    s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+                   &my_bytes_read);
+    if (!s.ok()) {
+      return s;
+    }
+    if (my_bytes_read == 0) {
+      return Status::Corruption("Unexpected EOF when reading size of the key");
+    }
+    *bytes_read += my_bytes_read;
+
+    switch (entry_type) {
+      case kFullKey: {
+        expect_suffix = false;
+        Slice decoded_internal_key;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &decoded_internal_key);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode ||
+            (internal_key != nullptr && !decoded_internal_key_valid)) {
+          // In non-mmap mode, always need to make a copy of keys returned to
+          // users, because after reading value for the key, the key might
+          // be invalid.
+          cur_key_.SetInternalKey(*parsed_key);
+          saved_user_key_ = cur_key_.GetUserKey();
+          if (!file_reader_.file_info()->is_mmap_mode) {
+            parsed_key->user_key =
+                Slice(cur_key_.GetInternalKey().data(), size);
+          }
+          if (internal_key != nullptr) {
+            *internal_key = cur_key_.GetInternalKey();
+          }
+        } else {
+          if (internal_key != nullptr) {
+            *internal_key = decoded_internal_key;
+          }
+          saved_user_key_ = parsed_key->user_key;
+        }
+        break;
+      }
+      case kPrefixFromPreviousKey: {
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+        prefix_len_ = size;
+        assert(prefix_extractor_ == nullptr ||
+               prefix_extractor_->Transform(saved_user_key_).size() ==
+                   prefix_len_);
+        // Need read another size flag for suffix
+        expect_suffix = true;
+        break;
+      }
+      case kKeySuffix: {
+        expect_suffix = false;
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+
+        Slice tmp_slice;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &tmp_slice);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode) {
+          // In non-mmap mode, we need to make a copy of keys returned to
+          // users, because after reading value for the key, the key might
+          // be invalid.
+          // saved_user_key_ points to cur_key_. We are making a copy of
+          // the prefix part to another string, and construct the current
+          // key from the prefix part and the suffix part back to cur_key_.
+          std::string tmp =
+              Slice(saved_user_key_.data(), prefix_len_).ToString();
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(tmp, *parsed_key);
+          parsed_key->user_key =
+              Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size);
+          saved_user_key_ = cur_key_.GetUserKey();
+        } else {
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+                                  *parsed_key);
+        }
+        parsed_key->user_key = cur_key_.GetUserKey();
+        if (internal_key != nullptr) {
+          *internal_key = cur_key_.GetInternalKey();
+        }
+        break;
+      }
+      default:
+        return Status::Corruption("Un-identified size flag.");
+    }
+  } while (expect_suffix);  // Another round if suffix is expected.
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
+                                     ParsedInternalKey* parsed_key,
+                                     Slice* internal_key, Slice* value,
+                                     uint32_t* bytes_read, bool* seekable) {
+  assert(value != nullptr);
+  Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+                            seekable);
+  if (s.ok()) {
+    assert(bytes_read != nullptr);
+    uint32_t value_size;
+    uint32_t value_size_bytes;
+    bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+                                             &value_size, &value_size_bytes);
+    if (!success) {
+      return file_reader_.status();
+    }
+    if (value_size_bytes == 0) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next value's size.");
+    }
+    *bytes_read += value_size_bytes;
+    success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *bytes_read += value_size;
+  }
+  return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+                                            ParsedInternalKey* parsed_key,
+                                            Slice* internal_key,
+                                            uint32_t* bytes_read,
+                                            bool* seekable) {
+  *bytes_read = 0;
+  if (seekable != nullptr) {
+    *seekable = true;
+  }
+  if (encoding_type_ == kPlain) {
+    return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
+                                bytes_read, seekable);
+  } else {
+    assert(encoding_type_ == kPrefix);
+    return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
+                                 bytes_read, seekable);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LIT
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h
new file mode 100644
index 000000000..9cda7df32
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.h
@@ -0,0 +1,201 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <array>
+
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// The file contains three helper classes of PlainTable format,
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes issue the lowest level of operations of PlainTable.
+// Actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper class for PlainTable format to write out a key to an output file
+// The class is used in PlainTableBuilder.
+class PlainTableKeyEncoder {
+ public:
+  explicit PlainTableKeyEncoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor,
+                                size_t index_sparseness)
+      : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+        key_count_for_prefix_(0) {}
+  // key: the key to write out, in the format of internal key.
+  // file: the output file to write out
+  // offset: offset in the file. Needs to be updated after appending bytes
+  //         for the key
+  // meta_bytes_buf: buffer for extra meta bytes
+  // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+  //                      if meta_bytes_buf is updated.
+  IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
+                     uint64_t* offset, char* meta_bytes_buf,
+                     size_t* meta_bytes_buf_size);
+
+  // Return actual encoding type to be picked
+  EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether it is a mmap-read, or a non-mmap read.
+// The class is implemented in a way to favor the performance of mmap case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+      : file_info_(_file_info), num_buf_(0) {}
+
+  ~PlainTableFileReader() {
+    // Should fix.
+    status_.PermitUncheckedError();
+  }
+
+  // In mmaped mode, the results point to mmaped area of the file, which
+  // means it is always valid before closing the file.
+  // In non-mmap mode, the results point to an internal buffer. If the caller
+  // makes another read call, the results may not be valid. So callers should
+  // make a copy when needed.
+  // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // columns these two common use cases:
+  // (1) hash index only identify one location, we read the key to verify
+  //     the location, and read key and value if it is the right location.
+  // (2) after hash index checking, we identify two locations (because of
+  //     hash bucket conflicts), we binary search the two location to see
+  //     which one is what we need and start to read from the location.
+  // These two most common use cases will be covered by the two buffers
+  // so that we don't need to re-read the same location.
+  // Currently we keep a fixed size buffer. If a read doesn't exactly fit
+  // the buffer, we replace the second buffer with the location user reads.
+  //
+  // If return false, status code is stored in status_.
+  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+    if (file_info_->is_mmap_mode) {
+      assert(file_offset + len <= file_info_->data_end_offset);
+      *out = Slice(file_info_->file_data.data() + file_offset, len);
+      return true;
+    } else {
+      return ReadNonMmap(file_offset, len, out);
+    }
+  }
+
+  // If return false, status code is stored in status_.
+  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means eof. false means failure and status is saved
+  // in status_. Not directly returning Status to save copying status
+  // object to map previous performance of mmap mode.
+  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  Status status() const { return status_; }
+
+  const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+  const PlainTableReaderFileInfo* file_info_;
+
+  struct Buffer {
+    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+    std::unique_ptr<char[]> buf;
+    uint32_t buf_start_offset;
+    uint32_t buf_len;
+    uint32_t buf_capacity;
+  };
+
+  // Keep buffers for two recent reads.
+  std::array<std::unique_ptr<Buffer>, 2> buffers_;
+  uint32_t num_buf_;
+  Status status_;
+
+  Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys from input buffer
+// The class is used by PlainTableBuilder.
+class PlainTableKeyDecoder {
+ public:
+  explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+                                EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor)
+      : file_reader_(file_info),
+        encoding_type_(encoding_type),
+        prefix_len_(0),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        in_prefix_(false) {}
+
+  // Find the next key.
+  // start: char array where the key starts.
+  // limit: boundary of the char array
+  // parsed_key: the output of the result key
+  // internal_key: if not null, fill with the output of the result key in
+  //               un-parsed format
+  // bytes_read: how many bytes read from start. Output
+  // seekable: whether key can be read from this place. Used when building
+  //           indexes. Output.
+  Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                 Slice* internal_key, Slice* value, uint32_t* bytes_read,
+                 bool* seekable = nullptr);
+
+  Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                        Slice* internal_key, uint32_t* bytes_read,
+                        bool* seekable = nullptr);
+
+  PlainTableFileReader file_reader_;
+  EncodingType encoding_type_;
+  uint32_t prefix_len_;
+  uint32_t fixed_user_key_len_;
+  Slice saved_user_key_;
+  IterKey cur_key_;
+  const SliceTransform* prefix_extractor_;
+  bool in_prefix_;
+
+ private:
+  Status NextPlainEncodingKey(uint32_t start_offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, uint32_t* bytes_read,
+                              bool* seekable = nullptr);
+  Status NextPrefixEncodingKey(uint32_t start_offset,
+                               ParsedInternalKey* parsed_key,
+                               Slice* internal_key, uint32_t* bytes_read,
+                               bool* seekable = nullptr);
+  Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+                         ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+                         bool* internal_key_valid, Slice* internal_key);
+  inline Status DecodeSize(uint32_t start_offset,
+                           PlainTableEntryType* entry_type, uint32_t* key_size,
+                           uint32_t* bytes_read);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc
new file mode 100644
index 000000000..6ce3d0ab9
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.cc
@@ -0,0 +1,765 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "table/plain/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Safely getting a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as an fixed 32 bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+  return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+}  // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public InternalIterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  ~PlainTableIterator() override;
+
+  bool Valid() const override;
+
+  void SeekToFirst() override;
+
+  void SeekToLast() override;
+
+  void Seek(const Slice& target) override;
+
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+
+  void Prev() override;
+
+  Slice key() const override;
+
+  Slice value() const override;
+
+  Status status() const override;
+
+ private:
+  PlainTableReader* table_;
+  PlainTableKeyDecoder decoder_;
+  bool use_prefix_seek_;
+  uint32_t offset_;
+  uint32_t next_offset_;
+  Slice key_;
+  Slice value_;
+  Status status_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+PlainTableReader::PlainTableReader(
+    const ImmutableOptions& ioptions,
+    std::unique_ptr<RandomAccessFileReader>&& file,
+    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+    EncodingType encoding_type, uint64_t file_size,
+    const TableProperties* table_properties,
+    const SliceTransform* prefix_extractor)
+    : internal_comparator_(icomparator),
+      encoding_type_(encoding_type),
+      full_scan_mode_(false),
+      user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+      prefix_extractor_(prefix_extractor),
+      enable_bloom_(false),
+      bloom_(6),
+      file_info_(std::move(file), storage_options,
+                 static_cast<uint32_t>(table_properties->data_size)),
+      ioptions_(ioptions),
+      file_size_(file_size),
+      table_properties_(nullptr) {}
+
+PlainTableReader::~PlainTableReader() {
+  // Should fix?
+  status_.PermitUncheckedError();
+}
+
+Status PlainTableReader::Open(
+    const ImmutableOptions& ioptions, const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+    bool full_scan_mode, const bool immortal_table,
+    const SliceTransform* prefix_extractor) {
+  if (file_size > PlainTableIndex::kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  std::unique_ptr<TableProperties> props;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               ioptions, &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(hash_table_ratio >= 0.0);
+  auto& user_props = props->user_collected_properties;
+  auto prefix_extractor_in_file = props->prefix_extractor_name;
+
+  if (!full_scan_mode &&
+      !prefix_extractor_in_file.empty() /* old version sst file*/
+      && prefix_extractor_in_file != "nullptr") {
+    if (!prefix_extractor) {
+      return Status::InvalidArgument(
+          "Prefix extractor is missing when opening a PlainTable built "
+          "using a prefix extractor");
+    } else if (prefix_extractor_in_file != prefix_extractor->AsString()) {
+      return Status::InvalidArgument(
+          "Prefix extractor given doesn't match the one used to build "
+          "PlainTable");
+    }
+  }
+
+  EncodingType encoding_type = kPlain;
+  auto encoding_type_prop =
+      user_props.find(PlainTablePropertyNames::kEncodingType);
+  if (encoding_type_prop != user_props.end()) {
+    encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      ioptions, std::move(file), env_options, internal_comparator,
+      encoding_type, file_size, props.get(), prefix_extractor));
+
+  s = new_reader->MmapDataIfNeeded();
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!full_scan_mode) {
+    s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
+                                  hash_table_ratio, index_sparseness,
+                                  huge_page_tlb_size);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    // Flag to indicate it is a full scan mode so that none of the indexes
+    // can be used.
+    new_reader->full_scan_mode_ = true;
+  }
+  // PopulateIndex can add to the props, so don't store them until now
+  new_reader->table_properties_ = std::move(props);
+
+  if (immortal_table && new_reader->file_info_.is_mmap_mode) {
+    new_reader->dummy_cleanable_.reset(new Cleanable());
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+void PlainTableReader::SetupForCompaction() {}
+
+InternalIterator* PlainTableReader::NewIterator(
+    const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
+    Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+  // Not necessarily used here, but make sure this has been initialized
+  assert(table_properties_);
+
+  // Auto prefix mode is not implemented in PlainTable.
+  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+                         !options.auto_prefix_mode;
+  if (arena == nullptr) {
+    return new PlainTableIterator(this, use_prefix_seek);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+    return new (mem) PlainTableIterator(this, use_prefix_seek);
+  }
+}
+
+Status PlainTableReader::PopulateIndexRecordList(
+    PlainTableIndexBuilder* index_builder,
+    std::vector<uint32_t>* prefix_hashes) {
+  Slice prev_key_prefix_slice;
+  std::string prev_key_prefix_buf;
+  uint32_t pos = data_start_offset_;
+
+  bool is_first_record = true;
+  Slice key_prefix_slice;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  while (pos < file_info_.data_end_offset) {
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    bool seekable = false;
+    Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
+    if (!s.ok()) {
+      return s;
+    }
+
+    key_prefix_slice = GetPrefix(key);
+    if (enable_bloom_) {
+      bloom_.AddHash(GetSliceHash(key.user_key));
+    } else {
+      if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+        if (!is_first_record) {
+          prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+        }
+        if (file_info_.is_mmap_mode) {
+          prev_key_prefix_slice = key_prefix_slice;
+        } else {
+          prev_key_prefix_buf = key_prefix_slice.ToString();
+          prev_key_prefix_slice = prev_key_prefix_buf;
+        }
+      }
+    }
+
+    index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+    if (!seekable && is_first_record) {
+      return Status::Corruption("Key for a prefix is not seekable");
+    }
+
+    is_first_record = false;
+  }
+
+  prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+  auto s = index_.InitFromRawData(index_builder->Finish());
+  return s;
+}
+
+void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
+                                     size_t huge_page_tlb_size) {
+  uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
+  if (bloom_total_bits > 0) {
+    enable_bloom_ = true;
+    bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+                        huge_page_tlb_size, ioptions_.logger);
+  }
+}
+
+void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
+  assert(bloom_.IsInitialized());
+  for (const auto prefix_hash : prefix_hashes) {
+    bloom_.AddHash(prefix_hash);
+  }
+}
+
+Status PlainTableReader::MmapDataIfNeeded() {
+  if (file_info_.is_mmap_mode) {
+    // Get mmapped memory.
+    return file_info_.file->Read(
+        IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
+        nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  return Status::OK();
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+                                       int bloom_bits_per_key,
+                                       double hash_table_ratio,
+                                       size_t index_sparseness,
+                                       size_t huge_page_tlb_size) {
+  assert(props != nullptr);
+
+  BlockContents index_block_contents;
+  Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                           file_size_, kPlainTableMagicNumber, ioptions_,
+                           PlainTableIndexBuilder::kPlainTableIndexBlock,
+                           BlockType::kIndex, &index_block_contents);
+
+  bool index_in_file = s.ok();
+
+  BlockContents bloom_block_contents;
+  bool bloom_in_file = false;
+  // We only need to read the bloom block if index block is in file.
+  if (index_in_file) {
+    s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                      file_size_, kPlainTableMagicNumber, ioptions_,
+                      BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
+                      &bloom_block_contents);
+    bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
+  }
+
+  Slice* bloom_block;
+  if (bloom_in_file) {
+    // If bloom_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the alloated memory for the bloom block.
+    // It needs to be kept alive to keep `bloom_block` valid.
+    bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
+    bloom_block = &bloom_block_contents.data;
+  } else {
+    bloom_block = nullptr;
+  }
+
+  Slice* index_block;
+  if (index_in_file) {
+    // If index_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the alloated memory for the index block.
+    // It needs to be kept alive to keep `index_block` valid.
+    index_block_alloc_ = std::move(index_block_contents.allocation);
+    index_block = &index_block_contents.data;
+  } else {
+    index_block = nullptr;
+  }
+
+  if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
+    // moptions.prefix_extractor is requried for a hash-based look-up.
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor enable prefix hash mode.");
+  }
+
+  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
+  // for a prefix (starting from the first one), generate a record of (hash,
+  // offset) and append it to IndexRecordList, which is a data structure created
+  // to store them.
+
+  if (!index_in_file) {
+    // Allocate bloom filter here for total order mode.
+    if (IsTotalOrderMode()) {
+      AllocateBloom(bloom_bits_per_key,
+                    static_cast<uint32_t>(props->num_entries),
+                    huge_page_tlb_size);
+    }
+  } else if (bloom_in_file) {
+    enable_bloom_ = true;
+    auto num_blocks_property = props->user_collected_properties.find(
+        PlainTablePropertyNames::kNumBloomBlocks);
+
+    uint32_t num_blocks = 0;
+    if (num_blocks_property != props->user_collected_properties.end()) {
+      Slice temp_slice(num_blocks_property->second);
+      if (!GetVarint32(&temp_slice, &num_blocks)) {
+        num_blocks = 0;
+      }
+    }
+    // cast away const qualifier, because bloom_ won't be changed
+    bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
+                      static_cast<uint32_t>(bloom_block->size()) * 8,
+                      num_blocks);
+  } else {
+    // Index in file but no bloom in file. Disable bloom filter in this case.
+    enable_bloom_ = false;
+    bloom_bits_per_key = 0;
+  }
+
+  PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
+                                       index_sparseness, hash_table_ratio,
+                                       huge_page_tlb_size);
+
+  std::vector<uint32_t> prefix_hashes;
+  if (!index_in_file) {
+    // Populates _bloom if enabled (total order mode)
+    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    s = index_.InitFromRawData(*index_block);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!index_in_file) {
+    if (!IsTotalOrderMode()) {
+      // Calculated bloom filter size and allocate memory for
+      // bloom filter based on the number of prefixes, then fill it.
+      AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                    huge_page_tlb_size);
+      if (enable_bloom_) {
+        FillBloom(prefix_hashes);
+      }
+    }
+  }
+
+  // Fill two table properties.
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(0);
+  }
+
+  return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
+                                   const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t* offset) const {
+  prefix_matched = false;
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t upper_bound = 0;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
+  uint32_t low = 0;
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  Status s = ParseInternalKey(target, &parsed_target,
+                              false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  // The key is between [low, high). Do a binary search between it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+    uint32_t tmp;
+    s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // Happen to have found the exact key or target is smaller than the
+        // first key after base_offset.
+        prefix_matched = true;
+        *offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both of the key at the position low or low+1 could share the same
+  // prefix as target. We need to rule out one of them to avoid to go
+  // to the wrong prefix.
+  ParsedInternalKey low_key;
+  uint32_t tmp;
+  uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+  s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    *offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There is possible a next prefix, return it
+    prefix_matched = false;
+    *offset = GetFixed32Element(base_ptr, low + 1);
+  } else {
+    // target is larger than a key of the last prefix in this bucket
+    // but with a different prefix. Key does not exist.
+    *offset = file_info_.data_end_offset;
+  }
+  return Status::OK();
+}
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+  if (!enable_bloom_) {
+    return true;
+  }
+
+  if (bloom_.MayContainHash(hash)) {
+    PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+    return true;
+  } else {
+    PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    return false;
+  }
+}
+
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, Slice* value,
+                              bool* seekable) const {
+  if (*offset == file_info_.data_end_offset) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  }
+
+  if (*offset > file_info_.data_end_offset) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  uint32_t bytes_read;
+  Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+                              &bytes_read, seekable);
+  if (!s.ok()) {
+    return s;
+  }
+  *offset = *offset + bytes_read;
+  return Status::OK();
+}
+
+void PlainTableReader::Prepare(const Slice& target) {
+  if (enable_bloom_) {
+    uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+    bloom_.Prefetch(prefix_hash);
+  }
+}
+
+Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
+                             GetContext* get_context,
+                             const SliceTransform* /* prefix_extractor */,
+                             bool /*skip_filters*/) {
+  // Check bloom filter first.
+  Slice prefix_slice;
+  uint32_t prefix_hash;
+  if (IsTotalOrderMode()) {
+    if (full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Get() is not allowed in full scan mode.");
+    }
+    // Match whole user key for bloom filter check.
+    if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) {
+      return Status::OK();
+    }
+    // in total order mode, there is only one bucket 0, and we always use empty
+    // prefix.
+    prefix_slice = Slice();
+    prefix_hash = 0;
+  } else {
+    prefix_slice = GetPrefix(target);
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!MatchBloom(prefix_hash)) {
+      return Status::OK();
+    }
+  }
+  uint32_t offset;
+  bool prefix_match;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
+                       prefix_match, &offset);
+
+  if (!s.ok()) {
+    return s;
+  }
+  ParsedInternalKey found_key;
+  ParsedInternalKey parsed_target;
+  s = ParseInternalKey(target, &parsed_target,
+                       false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  Slice found_value;
+  while (offset < file_info_.data_end_offset) {
+    s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify prefix for the first key found if it is not yet
+      // checked.
+      if (GetPrefix(found_key) != prefix_slice) {
+        return Status::OK();
+      }
+      prefix_match = true;
+    }
+    // TODO(ljin): since we know the key comparison result here,
+    // can we enable the fast path?
+    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+      bool dont_care __attribute__((__unused__));
+      if (!get_context->SaveValue(found_key, found_value, &dont_care,
+                                  dummy_cleanable_.get())) {
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
+                                               TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
+                                           const Slice& /*end*/,
+                                           TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+                                       bool use_prefix_seek)
+    : table_(table),
+      decoder_(&table_->file_info_, table_->encoding_type_,
+               table_->user_key_len_, table_->prefix_extractor_),
+      use_prefix_seek_(use_prefix_seek) {
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+PlainTableIterator::~PlainTableIterator() {}
+
+bool PlainTableIterator::Valid() const {
+  return offset_ < table_->file_info_.data_end_offset &&
+         offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+  status_ = Status::OK();
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ >= table_->file_info_.data_end_offset) {
+    next_offset_ = offset_ = table_->file_info_.data_end_offset;
+  } else {
+    Next();
+  }
+}
+
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+  status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+  if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+    // This check is done here instead of NewIterator() to permit creating an
+    // iterator with total_order_seek = true even if we won't be able to Seek()
+    // it. This is needed for compaction: it creates iterator with
+    // total_order_seek = true but usually never does Seek() on it,
+    // only SeekToFirst().
+    status_ = Status::InvalidArgument(
+        "total_order_seek not implemented for PlainTable.");
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  // If the user doesn't set prefix seek option and we are not able to do a
+  // total Seek(). assert failure.
+  if (table_->IsTotalOrderMode()) {
+    if (table_->full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    } else if (table_->GetIndexSize() > 1) {
+      assert(false);
+      status_ = Status::NotSupported(
+          "PlainTable cannot issue non-prefix seek unless in total order "
+          "mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+
+  Slice prefix_slice = table_->GetPrefix(target);
+  uint32_t prefix_hash = 0;
+  // Bloom filter is ignored in total-order mode.
+  if (!table_->IsTotalOrderMode()) {
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!table_->MatchBloom(prefix_hash)) {
+      status_ = Status::OK();
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+  bool prefix_match;
+  status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
+                              prefix_match, &next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  if (next_offset_ < table_->file_info_.data_end_offset) {
+    for (Next(); status_.ok() && Valid(); Next()) {
+      if (!prefix_match) {
+        // Need to verify the first key's prefix
+        if (table_->GetPrefix(key()) != prefix_slice) {
+          offset_ = next_offset_ = table_->file_info_.data_end_offset;
+          break;
+        }
+        prefix_match = true;
+      }
+      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+        break;
+      }
+    }
+  } else {
+    offset_ = table_->file_info_.data_end_offset;
+  }
+}
+
+void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
+  assert(false);
+  status_ =
+      Status::NotSupported("SeekForPrev() is not supported in PlainTable");
+  offset_ = next_offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ < table_->file_info_.data_end_offset) {
+    Slice tmp_slice;
+    ParsedInternalKey parsed_key;
+    status_ =
+        table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
+    if (!status_.ok()) {
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    }
+  }
+}
+
+void PlainTableIterator::Prev() { assert(false); }
+
+Slice PlainTableIterator::key() const {
+  assert(Valid());
+  return key_;
+}
+
+Slice PlainTableIterator::value() const {
+  assert(Valid());
+  return value_;
+}
+
+Status PlainTableIterator::status() const { return status_; }
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h
new file mode 100644
index 000000000..62bda693a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.h
@@ -0,0 +1,244 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Block;
+struct BlockContents;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
+
+extern const uint32_t kPlainTableVariableLength;
+
+struct PlainTableReaderFileInfo {
+  bool is_mmap_mode;
+  Slice file_data;
+  uint32_t data_end_offset;
+  std::unique_ptr<RandomAccessFileReader> file;
+
+  PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
+                           const EnvOptions& storage_options,
+                           uint32_t _data_size_offset)
+      : is_mmap_mode(storage_options.use_mmap_reads),
+        data_end_offset(_data_size_offset),
+        file(std::move(_file)) {}
+};
+
+// The reader class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableReader : public TableReader {
+ public:
+  // Based on following output file format shown in plain_table_factory.h
+  // When opening the output file, PlainTableReader creates a hash table
+  // from key prefixes to offset of the output file. PlainTable will decide
+  // whether it points to the data offset of the first key with the key prefix
+  // or the offset of it. If there are too many keys share this prefix, it will
+  // create a binary search-able index from the suffix to offset on disk.
+  static Status Open(const ImmutableOptions& ioptions,
+                     const EnvOptions& env_options,
+                     const InternalKeyComparator& internal_comparator,
+                     std::unique_ptr<RandomAccessFileReader>&& file,
+                     uint64_t file_size, std::unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio,
+                     size_t index_sparseness, size_t huge_page_tlb_size,
+                     bool full_scan_mode, const bool immortal_table = false,
+                     const SliceTransform* prefix_extractor = nullptr);
+
+  // Returns new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  void Prepare(const Slice& target) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key,
+                               TableReaderCaller caller) override;
+
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           TableReaderCaller caller) override;
+
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    return table_properties_;
+  }
+
+  virtual size_t ApproximateMemoryUsage() const override {
+    return arena_.MemoryAllocatedBytes();
+  }
+
+  PlainTableReader(const ImmutableOptions& ioptions,
+                   std::unique_ptr<RandomAccessFileReader>&& file,
+                   const EnvOptions& env_options,
+                   const InternalKeyComparator& internal_comparator,
+                   EncodingType encoding_type, uint64_t file_size,
+                   const TableProperties* table_properties,
+                   const SliceTransform* prefix_extractor);
+  virtual ~PlainTableReader();
+
+ protected:
+  // Check bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for index lookup
+  // too.
+  virtual bool MatchBloom(uint32_t hash) const;
+
+  // PopulateIndex() builds index of keys. It must be called before any query
+  // to the table.
+  //
+  // props: the table properties object that need to be stored. Ownership of
+  //        the object will be passed.
+  //
+
+  Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       size_t huge_page_tlb_size);
+
+  Status MmapDataIfNeeded();
+
+ private:
+  const InternalKeyComparator internal_comparator_;
+  EncodingType encoding_type_;
+  // represents plain table's current status.
+  Status status_;
+
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
+  // data_start_offset_ and data_end_offset_ defines the range of the
+  // sst file that stores data.
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t user_key_len_;
+  const SliceTransform* prefix_extractor_;
+
+  static const size_t kNumInternalBytes = 8;
+
+  // Bloom filter is used to rule out non-existent key
+  bool enable_bloom_;
+  PlainTableBloomV1 bloom_;
+  PlainTableReaderFileInfo file_info_;
+  Arena arena_;
+  CacheAllocationPtr index_block_alloc_;
+  CacheAllocationPtr bloom_block_alloc_;
+
+  const ImmutableOptions& ioptions_;
+  std::unique_ptr<Cleanable> dummy_cleanable_;
+  uint64_t file_size_;
+
+ protected:  // for testing
+  std::shared_ptr<const TableProperties> table_properties_;
+
+ private:
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(ExtractUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  // If bloom_ is not null, all the keys' full-key hash will be added to the
+  // bloom filter.
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 std::vector<uint32_t>* prefix_hashes);
+
+  // Internal helper function to allocate memory for bloom filter
+  void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
+                     size_t huge_page_tlb_size);
+
+  void FillBloom(const std::vector<uint32_t>& prefix_hashes);
+
+  // Read the key and value at `offset` to parameters for keys, the and
+  // `seekable`.
+  // On success, `offset` will be updated as the offset for the next key.
+  // `parsed_key` will be key in parsed format.
+  // if `internal_key` is not empty, it will be filled with key with slice
+  // format.
+  // if `seekable` is not null, it will return whether we can directly read
+  // data using this offset.
+  Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+              ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+              bool* seekable = nullptr) const;
+  // Get file offset for key target.
+  // return value prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
+                   const Slice& prefix, uint32_t prefix_hash,
+                   bool& prefix_matched, uint32_t* offset) const;
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE