summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/plain
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/rocksdb/table/plain/plain_table_bloom.cc78
-rw-r--r--src/rocksdb/table/plain/plain_table_bloom.h132
-rw-r--r--src/rocksdb/table/plain/plain_table_builder.cc337
-rw-r--r--src/rocksdb/table/plain/plain_table_builder.h154
-rw-r--r--src/rocksdb/table/plain/plain_table_factory.cc350
-rw-r--r--src/rocksdb/table/plain/plain_table_factory.h182
-rw-r--r--src/rocksdb/table/plain/plain_table_index.cc213
-rw-r--r--src/rocksdb/table/plain/plain_table_index.h248
-rw-r--r--src/rocksdb/table/plain/plain_table_key_coding.cc509
-rw-r--r--src/rocksdb/table/plain/plain_table_key_coding.h201
-rw-r--r--src/rocksdb/table/plain/plain_table_reader.cc765
-rw-r--r--src/rocksdb/table/plain/plain_table_reader.h244
12 files changed, 3413 insertions, 0 deletions
diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc
new file mode 100644
index 000000000..21441f616
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/plain/plain_table_bloom.h"
+
+#include <algorithm>
+#include <string>
+
+#include "memory/allocator.h"
+#include "util/dynamic_bloom.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+ uint32_t num_blocks =
+ (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+ // Make num_blocks an odd number to make sure more bits are involved
+ // when determining which block.
+ if (num_blocks % 2 == 0) {
+ num_blocks++;
+ }
+
+ return num_blocks * (CACHE_LINE_SIZE * 8);
+}
+} // namespace
+
+PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes)
+ : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks) {
+ data_ = raw_data;
+ kTotalBits = total_bits;
+ kNumBlocks = num_blocks;
+}
+
+void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality,
+ size_t huge_page_tlb_size,
+ Logger* logger) {
+ kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+ : (total_bits + 7) / 8 * 8;
+ kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+ assert(kNumBlocks > 0 || kTotalBits > 0);
+ assert(kNumProbes > 0);
+
+ uint32_t sz = kTotalBits / 8;
+ if (kNumBlocks > 0) {
+ sz += CACHE_LINE_SIZE - 1;
+ }
+ assert(allocator);
+
+ char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+ memset(raw, 0, sz);
+ auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+ if (kNumBlocks > 0 && cache_line_offset > 0) {
+ raw += CACHE_LINE_SIZE - cache_line_offset;
+ }
+ data_ = raw;
+}
+
+void BloomBlockBuilder::AddKeysHashes(
+ const std::vector<uint32_t>& keys_hashes) {
+ for (auto hash : keys_hashes) {
+ bloom_.AddHash(hash);
+ }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h
new file mode 100644
index 000000000..460e7ec39
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/bloom_impl.h"
+#include "util/hash.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Slice;
+class Allocator;
+class Logger;
+
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
+class PlainTableBloomV1 {
+ public:
+ // allocator: pass allocator to bloom filter, hence trace the usage of memory
+ // total_bits: fixed total bits for the bloom
+ // num_probes: number of hash probes for a single key
+ // locality: If positive, optimize for cache line locality, 0 otherwise.
+ // hash_func: customized hash function
+ // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
+ // within this page size. Need to reserve huge pages for
+ // it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ explicit PlainTableBloomV1(uint32_t num_probes = 6);
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger);
+
+ ~PlainTableBloomV1() {}
+
+ // Assuming single threaded access to this function.
+ void AddHash(uint32_t hash);
+
+ // Multithreaded access to this function is OK
+ bool MayContainHash(uint32_t hash) const;
+
+ void Prefetch(uint32_t hash);
+
+ uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+ Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); }
+
+ void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0);
+
+ uint32_t GetTotalBits() const { return kTotalBits; }
+
+ bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+ uint32_t kTotalBits;
+ uint32_t kNumBlocks;
+ const uint32_t kNumProbes;
+
+ char* data_;
+
+ static constexpr int LOG2_CACHE_LINE_SIZE =
+ ConstexprFloorLog2(CACHE_LINE_SIZE);
+};
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void PlainTableBloomV1::Prefetch(uint32_t h) {
+ if (kNumBlocks != 0) {
+ uint32_t ignored;
+ LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch(
+ h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE);
+ }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const {
+ assert(IsInitialized());
+ if (kNumBlocks != 0) {
+ return LegacyLocalityBloomImpl<true>::HashMayMatch(
+ h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE);
+ } else {
+ return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes,
+ data_);
+ }
+}
+
+inline void PlainTableBloomV1::AddHash(uint32_t h) {
+ assert(IsInitialized());
+ if (kNumBlocks != 0) {
+ LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_,
+ LOG2_CACHE_LINE_SIZE);
+ } else {
+ LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_);
+ }
+}
+
+class BloomBlockBuilder {
+ public:
+ static const std::string kBloomBlock;
+
+ explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {}
+
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger) {
+ bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size,
+ logger);
+ }
+
+ uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+ void AddKeysHashes(const std::vector<uint32_t>& keys_hashes);
+
+ Slice Finish();
+
+ private:
+ PlainTableBloomV1 bloom_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc
new file mode 100644
index 000000000..04723955c
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.cc
@@ -0,0 +1,337 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_builder.h"
+
+#include <assert.h>
+
+#include <limits>
+#include <map>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// a utility that helps writing block content to the file
+// @offset will advance if @block_contents was successfully written.
+// @block_handle the block handle this particular block.
+IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file,
+ uint64_t* offset, BlockHandle* block_handle) {
+ block_handle->set_offset(*offset);
+ block_handle->set_size(block_contents.size());
+ IOStatus io_s = file->Append(block_contents);
+
+ if (io_s.ok()) {
+ *offset += block_contents.size();
+ }
+ return io_s;
+}
+
+} // namespace
+
+// kPlainTableMagicNumber was picked by running
+// echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+PlainTableBuilder::PlainTableBuilder(
+ const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+ uint32_t column_family_id, int level_at_creation, WritableFileWriter* file,
+ uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness,
+ uint32_t bloom_bits_per_key, const std::string& column_family_name,
+ uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio,
+ bool store_index_in_file, const std::string& db_id,
+ const std::string& db_session_id, uint64_t file_number)
+ : ioptions_(ioptions),
+ moptions_(moptions),
+ bloom_block_(num_probes),
+ file_(file),
+ bloom_bits_per_key_(bloom_bits_per_key),
+ huge_page_tlb_size_(huge_page_tlb_size),
+ encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(),
+ index_sparseness),
+ store_index_in_file_(store_index_in_file),
+ prefix_extractor_(moptions.prefix_extractor.get()) {
+ // Build index block and save it in the file if hash_table_ratio > 0
+ if (store_index_in_file_) {
+ assert(hash_table_ratio > 0 || IsTotalOrderMode());
+ index_builder_.reset(new PlainTableIndexBuilder(
+ &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness,
+ hash_table_ratio, huge_page_tlb_size_));
+ properties_
+ .user_collected_properties[PlainTablePropertyNames::kBloomVersion] =
+ "1"; // For future use
+ }
+
+ properties_.fixed_key_len = user_key_len;
+
+ // for plain table, we put all the data in a big chuck.
+ properties_.num_data_blocks = 1;
+ // Fill it later if store_index_in_file_ == true
+ properties_.index_size = 0;
+ properties_.filter_size = 0;
+ // To support roll-back to previous version, now still use version 0 for
+ // plain encoding.
+ properties_.format_version = (encoding_type == kPlain) ? 0 : 1;
+ properties_.column_family_id = column_family_id;
+ properties_.column_family_name = column_family_name;
+ properties_.db_id = db_id;
+ properties_.db_session_id = db_session_id;
+ properties_.db_host_id = ioptions.db_host_id;
+ if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) {
+ ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set");
+ }
+ properties_.orig_file_number = file_number;
+ properties_.prefix_extractor_name =
+ moptions_.prefix_extractor != nullptr
+ ? moptions_.prefix_extractor->AsString()
+ : "nullptr";
+
+ std::string val;
+ PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType()));
+ properties_
+ .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val;
+
+ assert(int_tbl_prop_collector_factories);
+ for (auto& factory : *int_tbl_prop_collector_factories) {
+ assert(factory);
+
+ table_properties_collectors_.emplace_back(
+ factory->CreateIntTblPropCollector(column_family_id,
+ level_at_creation));
+ }
+}
+
+PlainTableBuilder::~PlainTableBuilder() {
+ // They are supposed to have been passed to users through Finish()
+ // if the file succeeds.
+ status_.PermitUncheckedError();
+ io_status_.PermitUncheckedError();
+}
+
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+ // temp buffer for metadata bytes between key and value.
+ char meta_bytes_buf[6];
+ size_t meta_bytes_buf_size = 0;
+
+ ParsedInternalKey internal_key;
+ if (!ParseInternalKey(key, &internal_key, false /* log_err_key */)
+ .ok()) { // TODO
+ assert(false);
+ return;
+ }
+ if (internal_key.type == kTypeRangeDeletion) {
+ status_ = Status::NotSupported("Range deletion unsupported");
+ return;
+ }
+
+ // Store key hash
+ if (store_index_in_file_) {
+ if (moptions_.prefix_extractor == nullptr) {
+ keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
+ } else {
+ Slice prefix =
+ moptions_.prefix_extractor->Transform(internal_key.user_key);
+ keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
+ }
+ }
+
+ // Write value
+ assert(offset_ <= std::numeric_limits<uint32_t>::max());
+ auto prev_offset = static_cast<uint32_t>(offset_);
+ // Write out the key
+ io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
+ &meta_bytes_buf_size);
+ if (SaveIndexInFile()) {
+ index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
+ }
+
+ // Write value length
+ uint32_t value_size = static_cast<uint32_t>(value.size());
+ if (io_status_.ok()) {
+ char* end_ptr =
+ EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size);
+ assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf));
+ meta_bytes_buf_size = end_ptr - meta_bytes_buf;
+ io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size));
+ }
+
+ // Write value
+ if (io_status_.ok()) {
+ io_status_ = file_->Append(value);
+ offset_ += value_size + meta_bytes_buf_size;
+ }
+
+ if (io_status_.ok()) {
+ properties_.num_entries++;
+ properties_.raw_key_size += key.size();
+ properties_.raw_value_size += value.size();
+ if (internal_key.type == kTypeDeletion ||
+ internal_key.type == kTypeSingleDeletion) {
+ properties_.num_deletions++;
+ } else if (internal_key.type == kTypeMerge) {
+ properties_.num_merge_operands++;
+ }
+ }
+
+ // notify property collectors
+ NotifyCollectTableCollectorsOnAdd(
+ key, value, offset_, table_properties_collectors_, ioptions_.logger);
+ status_ = io_status_;
+}
+
+Status PlainTableBuilder::Finish() {
+ assert(!closed_);
+ closed_ = true;
+
+ properties_.data_size = offset_;
+
+ // Write the following blocks
+ // 1. [meta block: bloom] - optional
+ // 2. [meta block: index] - optional
+ // 3. [meta block: properties]
+ // 4. [metaindex block]
+ // 5. [footer]
+
+ MetaIndexBuilder meta_index_builer;
+
+ if (store_index_in_file_ && (properties_.num_entries > 0)) {
+ assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
+ BlockHandle bloom_block_handle;
+ if (bloom_bits_per_key_ > 0) {
+ bloom_block_.SetTotalBits(
+ &arena_,
+ static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
+ ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger);
+
+ PutVarint32(&properties_.user_collected_properties
+ [PlainTablePropertyNames::kNumBloomBlocks],
+ bloom_block_.GetNumBlocks());
+
+ bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
+
+ Slice bloom_finish_result = bloom_block_.Finish();
+
+ properties_.filter_size = bloom_finish_result.size();
+ io_status_ =
+ WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle);
+
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+ meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
+ }
+ BlockHandle index_block_handle;
+ Slice index_finish_result = index_builder_->Finish();
+
+ properties_.index_size = index_finish_result.size();
+ io_status_ =
+ WriteBlock(index_finish_result, file_, &offset_, &index_block_handle);
+
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+
+ meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
+ index_block_handle);
+ }
+
+ // Calculate bloom block size and index block size
+ PropertyBlockBuilder property_block_builder;
+ // -- Add basic properties
+ property_block_builder.AddTableProperty(properties_);
+
+ property_block_builder.Add(properties_.user_collected_properties);
+
+ // -- Add user collected properties
+ NotifyCollectTableCollectorsOnFinish(
+ table_properties_collectors_, ioptions_.logger, &property_block_builder);
+
+ // -- Write property block
+ BlockHandle property_block_handle;
+ IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_,
+ &property_block_handle);
+ if (!s.ok()) {
+ return static_cast<Status>(s);
+ }
+ meta_index_builer.Add(kPropertiesBlockName, property_block_handle);
+
+ // -- write metaindex block
+ BlockHandle metaindex_block_handle;
+ io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_,
+ &metaindex_block_handle);
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+
+ // Write Footer
+ // no need to write out new footer if we're using default checksum
+ FooterBuilder footer;
+ footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_,
+ kNoChecksum, metaindex_block_handle);
+ io_status_ = file_->Append(footer.GetSlice());
+ if (io_status_.ok()) {
+ offset_ += footer.GetSlice().size();
+ }
+ status_ = io_status_;
+ return status_;
+}
+
+void PlainTableBuilder::Abandon() { closed_ = true; }
+
+uint64_t PlainTableBuilder::NumEntries() const {
+ return properties_.num_entries;
+}
+
+uint64_t PlainTableBuilder::FileSize() const { return offset_; }
+
+std::string PlainTableBuilder::GetFileChecksum() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksum();
+ } else {
+ return kUnknownFileChecksum;
+ }
+}
+
+const char* PlainTableBuilder::GetFileChecksumFuncName() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksumFuncName();
+ } else {
+ return kUnknownFileChecksumFuncName;
+ }
+}
+void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string,
+ uint64_t uint_64) {
+ // TODO: storing seqno to time mapping is not yet support for plain table.
+ TableBuilder::SetSeqnoTimeTableProperties(string, uint_64);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h
new file mode 100644
index 000000000..445491c2a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_index.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+// The builder class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableBuilder : public TableBuilder {
+ public:
+ // Create a builder that will store the contents of the table it is
+ // building in *file. Does not close the file. It is up to the
+ // caller to close the file after calling Finish(). The output file
+ // will be part of level specified by 'level'. A value of -1 means
+ // that the caller does not know which level the output file will reside.
+ PlainTableBuilder(
+ const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+ uint32_t column_family_id, int level_at_creation,
+ WritableFileWriter* file, uint32_t user_key_size,
+ EncodingType encoding_type, size_t index_sparseness,
+ uint32_t bloom_bits_per_key, const std::string& column_family_name,
+ uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+ double hash_table_ratio = 0, bool store_index_in_file = false,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ uint64_t file_number = 0);
+ // No copying allowed
+ PlainTableBuilder(const PlainTableBuilder&) = delete;
+ void operator=(const PlainTableBuilder&) = delete;
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~PlainTableBuilder();
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override;
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override { return status_; }
+
+ // Return non-ok iff some error happens during IO.
+ IOStatus io_status() const override { return io_status_; }
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish() override;
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon() override;
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const override;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const override;
+
+ TableProperties GetTableProperties() const override { return properties_; }
+
+ bool SaveIndexInFile() const { return store_index_in_file_; }
+
+ // Get file checksum
+ std::string GetFileChecksum() const override;
+
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override;
+
+ void SetSeqnoTimeTableProperties(const std::string& string,
+ uint64_t uint_64) override;
+
+ private:
+ Arena arena_;
+ const ImmutableOptions& ioptions_;
+ const MutableCFOptions& moptions_;
+ std::vector<std::unique_ptr<IntTblPropCollector>>
+ table_properties_collectors_;
+
+ BloomBlockBuilder bloom_block_;
+ std::unique_ptr<PlainTableIndexBuilder> index_builder_;
+
+ WritableFileWriter* file_;
+ uint64_t offset_ = 0;
+ uint32_t bloom_bits_per_key_;
+ size_t huge_page_tlb_size_;
+ Status status_;
+ IOStatus io_status_;
+ TableProperties properties_;
+ PlainTableKeyEncoder encoder_;
+
+ bool store_index_in_file_;
+
+ std::vector<uint32_t> keys_or_prefixes_hashes_;
+ bool closed_ = false; // Either Finish() or Abandon() has been called.
+
+ const SliceTransform* prefix_extractor_;
+
+ Slice GetPrefix(const Slice& target) const {
+ assert(target.size() >= 8); // target is internal key
+ return GetPrefixFromUserKey(ExtractUserKey(target));
+ }
+
+ Slice GetPrefix(const ParsedInternalKey& target) const {
+ return GetPrefixFromUserKey(target.user_key);
+ }
+
+ Slice GetPrefixFromUserKey(const Slice& user_key) const {
+ if (!IsTotalOrderMode()) {
+ return prefix_extractor_->Transform(user_key);
+ } else {
+ // Use empty slice as prefix if prefix_extractor is not set.
+ // In that case,
+ // it falls back to pure binary search and
+ // total iterator seek is supported.
+ return Slice();
+ }
+ }
+
+ bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc
new file mode 100644
index 000000000..dfe5241a5
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.cc
@@ -0,0 +1,350 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain/plain_table_factory.h"
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/plain/plain_table_builder.h"
+#include "table/plain/plain_table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
+ {"user_key_len",
+ {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"bloom_bits_per_key",
+ {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"hash_table_ratio",
+ {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"index_sparseness",
+ {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"huge_page_tlb_size",
+ {offsetof(struct PlainTableOptions, huge_page_tlb_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"encoding_type",
+ {offsetof(struct PlainTableOptions, encoding_type),
+ OptionType::kEncodingType, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"full_scan_mode",
+ {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"store_index_in_file",
+ {offsetof(struct PlainTableOptions, store_index_in_file),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+PlainTableFactory::PlainTableFactory(const PlainTableOptions& options)
+ : table_options_(options) {
+ RegisterOptions(&table_options_, &plain_table_type_info);
+}
+
+Status PlainTableFactory::NewTableReader(
+ const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ return PlainTableReader::Open(
+ table_reader_options.ioptions, table_reader_options.env_options,
+ table_reader_options.internal_comparator, std::move(file), file_size,
+ table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio,
+ table_options_.index_sparseness, table_options_.huge_page_tlb_size,
+ table_options_.full_scan_mode, table_reader_options.immortal,
+ table_reader_options.prefix_extractor.get());
+}
+
+TableBuilder* PlainTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const {
+ // Ignore the skip_filters flag. PlainTable format is optimized for small
+ // in-memory dbs. The skip_filters optimization is not useful for plain
+ // tables
+ //
+ return new PlainTableBuilder(
+ table_builder_options.ioptions, table_builder_options.moptions,
+ table_builder_options.int_tbl_prop_collector_factories,
+ table_builder_options.column_family_id,
+ table_builder_options.level_at_creation, file,
+ table_options_.user_key_len, table_options_.encoding_type,
+ table_options_.index_sparseness, table_options_.bloom_bits_per_key,
+ table_builder_options.column_family_name, 6,
+ table_options_.huge_page_tlb_size, table_options_.hash_table_ratio,
+ table_options_.store_index_in_file, table_builder_options.db_id,
+ table_builder_options.db_session_id, table_builder_options.cur_file_num);
+}
+
+std::string PlainTableFactory::GetPrintableOptions() const {
+ std::string ret;
+ ret.reserve(20000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " user_key_len: %u\n",
+ table_options_.user_key_len);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n",
+ table_options_.bloom_bits_per_key);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n",
+ table_options_.hash_table_ratio);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n",
+ table_options_.index_sparseness);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n",
+ table_options_.huge_page_tlb_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " encoding_type: %d\n",
+ table_options_.encoding_type);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " full_scan_mode: %d\n",
+ table_options_.full_scan_mode);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " store_index_in_file: %d\n",
+ table_options_.store_index_in_file);
+ ret.append(buffer);
+ return ret;
+}
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options) {
+ ConfigOptions config_options;
+ config_options.input_strings_escaped = false;
+ config_options.ignore_unknown_options = false;
+ config_options.invoke_prepare_options = false;
+ return GetPlainTableOptionsFromString(config_options, table_options, opts_str,
+ new_table_options);
+}
+
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+ const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options) {
+ std::unordered_map<std::string, std::string> opts_map;
+ Status s = StringToMap(opts_str, &opts_map);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+ new_table_options);
+ // Translate any errors (NotFound, NotSupported, to InvalidArgument
+ if (s.ok() || s.IsInvalidArgument()) {
+ return s;
+ } else {
+ return Status::InvalidArgument(s.getState());
+ }
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ // The MemTableRepFactory built-in classes will be either a class
+ // (VectorRepFactory) or a nickname (vector), followed optionally by ":#",
+ // where # is the "size" of the factory.
+ auto AsPattern = [](const std::string& name, const std::string& alt) {
+ auto pattern = ObjectLibrary::PatternEntry(name, true);
+ pattern.AnotherName(alt);
+ pattern.AddNumber(":");
+ return pattern;
+ };
+ library.AddFactory<MemTableRepFactory>(
+ AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()),
+ [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+ std::string* /*errmsg*/) {
+ auto colon = uri.find(":");
+ if (colon != std::string::npos) {
+ size_t count = ParseSizeT(uri.substr(colon + 1));
+ guard->reset(new VectorRepFactory(count));
+ } else {
+ guard->reset(new VectorRepFactory());
+ }
+ return guard->get();
+ });
+ library.AddFactory<MemTableRepFactory>(
+ AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()),
+ [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+ std::string* /*errmsg*/) {
+ auto colon = uri.find(":");
+ if (colon != std::string::npos) {
+ size_t lookahead = ParseSizeT(uri.substr(colon + 1));
+ guard->reset(new SkipListFactory(lookahead));
+ } else {
+ guard->reset(new SkipListFactory());
+ }
+ return guard->get();
+ });
+ library.AddFactory<MemTableRepFactory>(
+ AsPattern("HashLinkListRepFactory", "hash_linkedlist"),
+ [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+ std::string* /*errmsg*/) {
+ // Expecting format: hash_linkedlist:<hash_bucket_count>
+ auto colon = uri.find(":");
+ if (colon != std::string::npos) {
+ size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+ guard->reset(NewHashLinkListRepFactory(hash_bucket_count));
+ } else {
+ guard->reset(NewHashLinkListRepFactory());
+ }
+ return guard->get();
+ });
+ library.AddFactory<MemTableRepFactory>(
+ AsPattern("HashSkipListRepFactory", "prefix_hash"),
+ [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+ std::string* /*errmsg*/) {
+ // Expecting format: prefix_hash:<hash_bucket_count>
+ auto colon = uri.find(":");
+ if (colon != std::string::npos) {
+ size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+ guard->reset(NewHashSkipListRepFactory(hash_bucket_count));
+ } else {
+ guard->reset(NewHashSkipListRepFactory());
+ }
+ return guard->get();
+ });
+ library.AddFactory<MemTableRepFactory>(
+ "cuckoo",
+ [](const std::string& /*uri*/,
+ std::unique_ptr<MemTableRepFactory>* /*guard*/, std::string* errmsg) {
+ *errmsg = "cuckoo hash memtable is not supported anymore.";
+ return nullptr;
+ });
+
+ size_t num_types;
+ return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif // ROCKSDB_LITE
+
+Status GetMemTableRepFactoryFromString(
+ const std::string& opts_str, std::unique_ptr<MemTableRepFactory>* result) {
+ ConfigOptions config_options;
+ config_options.ignore_unsupported_options = false;
+ config_options.ignore_unknown_options = false;
+ return MemTableRepFactory::CreateFromString(config_options, opts_str, result);
+}
+
+Status MemTableRepFactory::CreateFromString(
+ const ConfigOptions& config_options, const std::string& value,
+ std::unique_ptr<MemTableRepFactory>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ std::string id;
+ std::unordered_map<std::string, std::string> opt_map;
+ Status status = Customizable::GetOptionsMap(config_options, result->get(),
+ value, &id, &opt_map);
+ if (!status.ok()) { // GetOptionsMap failed
+ return status;
+ } else if (value.empty()) {
+ // No Id and no options. Clear the object
+ result->reset();
+ return Status::OK();
+ } else if (id.empty()) { // We have no Id but have options. Not good
+ return Status::NotSupported("Cannot reset object ", id);
+ } else {
+#ifndef ROCKSDB_LITE
+ status = NewUniqueObject<MemTableRepFactory>(config_options, id, opt_map,
+ result);
+#else
+ // To make it possible to configure the memtables in LITE mode, the ID
+ // is of the form <name>:<size>, where name is the name of the class and
+ // <size> is the length of the object (e.g. skip_list:10).
+ std::vector<std::string> opts_list = StringSplit(id, ':');
+ if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) {
+ status = Status::InvalidArgument("Can't parse memtable_factory option ",
+ value);
+ } else if (opts_list[0] == SkipListFactory::kNickName() ||
+ opts_list[0] == SkipListFactory::kClassName()) {
+ // Expecting format
+ // skip_list:<lookahead>
+ if (opts_list.size() == 2) {
+ size_t lookahead = ParseSizeT(opts_list[1]);
+ result->reset(new SkipListFactory(lookahead));
+ } else {
+ result->reset(new SkipListFactory());
+ }
+ } else if (!config_options.ignore_unsupported_options) {
+ status = Status::NotSupported("Cannot load object in LITE mode ", id);
+ }
+#endif // ROCKSDB_LITE
+ }
+ return status;
+}
+
+Status MemTableRepFactory::CreateFromString(
+ const ConfigOptions& config_options, const std::string& value,
+ std::shared_ptr<MemTableRepFactory>* result) {
+ std::unique_ptr<MemTableRepFactory> factory;
+ Status s = CreateFromString(config_options, value, &factory);
+ if (factory && s.ok()) {
+ result->reset(factory.release());
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status GetPlainTableOptionsFromMap(
+ const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options, bool input_strings_escaped,
+ bool ignore_unknown_options) {
+ ConfigOptions config_options;
+ config_options.input_strings_escaped = input_strings_escaped;
+ config_options.ignore_unknown_options = ignore_unknown_options;
+ return GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+ new_table_options);
+}
+
+Status GetPlainTableOptionsFromMap(
+ const ConfigOptions& config_options, const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options) {
+ assert(new_table_options);
+ PlainTableFactory ptf(table_options);
+ Status s = ptf.ConfigureFromMap(config_options, opts_map);
+ if (s.ok()) {
+ *new_table_options = *(ptf.GetOptions<PlainTableOptions>());
+ } else {
+ // Restore "new_options" to the default "base_options".
+ *new_table_options = table_options;
+ }
+ return s;
+}
+
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
+ return new PlainTableFactory(options);
+}
+
+const std::string PlainTablePropertyNames::kEncodingType =
+ "rocksdb.plain.table.encoding.type";
+
+const std::string PlainTablePropertyNames::kBloomVersion =
+ "rocksdb.plain.table.bloom.version";
+
+const std::string PlainTablePropertyNames::kNumBloomBlocks =
+ "rocksdb.plain.table.bloom.numblocks";
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h
new file mode 100644
index 000000000..ce60b9d19
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.h
@@ -0,0 +1,182 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// PlainTableFactory is the entrance function to the PlainTable format of
+// SST files. It returns instances PlainTableBuilder as the builder
+// class and PlainTableReader as the reader class, where the format is
+// actually implemented.
+//
+// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
+// Data is not organized in blocks, which allows fast access. Because of
+// following downsides
+// 1. Data compression is not supported.
+// 2. Data is not checksumed.
+// it is not recommended to use this format on other type of file systems.
+//
+// PlainTable requires fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version | user_key_length |
+// +------------++------------+-----------------+ <= key1 offset
+// | encoded key1 | value_size | |
+// +------------+-------------+-------------+ |
+// | value1 |
+// | |
+// +--------------------------+-------------+---+ <= key2 offset
+// | encoded key2 | value_size | |
+// +------------+-------------+-------------+ |
+// | value2 |
+// | |
+// | ...... |
+// +-----------------+--------------------------+
+//
+// When the key encoding type is kPlain. Key part is encoded as:
+// +------------+--------------------+
+// | [key_size] | internal key |
+// +------------+--------------------+
+// for the case of user_key_len = kPlainTableVariableLength case,
+// and simply:
+// +----------------------+
+// | internal key |
+// +----------------------+
+// for user_key_len != kPlainTableVariableLength case.
+//
+// If key encoding type is kPrefix. Keys are encoding in this format.
+// There are three ways to encode a key:
+// (1) Full Key
+// +---------------+---------------+-------------------+
+// | Full Key Flag | Full Key Size | Full Internal Key |
+// +---------------+---------------+-------------------+
+// which simply encodes a full key
+//
+// (2) A key shared the same prefix as the previous key, which is encoded as
+// format of (1).
+// +-------------+-------------+-------------+-------------+------------+
+// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
+// +-------------+-------------+-------------+-------------+------------+
+// where key is the suffix part of the key, including the internal bytes.
+// the actual key will be constructed by concatenating prefix part of the
+// previous key, with the suffix part of the key here, with sizes given here.
+//
+// (3) A key shared the same prefix as the previous key, which is encoded as
+// the format of (2).
+// +-----------------+-----------------+------------------------+
+// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
+// +-----------------+-----------------+------------------------+
+// The key will be constructed by concatenating previous key's prefix (which is
+// also a prefix which the last key encoded in the format of (1)) and the
+// key given here.
+//
+// For example, we for following keys (prefix and suffix are separated by
+// spaces):
+// 0000 0001
+// 0000 00021
+// 0000 0002
+// 00011 00
+// 0002 0001
+// Will be encoded like this:
+// FK 8 00000001
+// PF 4 SF 5 00021
+// SF 4 0002
+// FK 7 0001100
+// FK 8 00020001
+// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
+//
+// All those "key flag + key size" shown above are in this format:
+// The 8 bits of the first byte:
+// +----+----+----+----+----+----+----+----+
+// | Type | Size |
+// +----+----+----+----+----+----+----+----+
+// Type indicates: full key, prefix, or suffix.
+// The last 6 bits are for size. If the size bits are not all 1, it means the
+// size of the key. Otherwise, varint32 is read after this byte. This varint
+// value + 0x3F (the value of all 1) will be the key size.
+//
+// For example, full key with length 16 will be encoded as (binary):
+// 00 010000
+// (00 means full key)
+// and a prefix with 100 bytes will be encoded as:
+// 01 111111 00100101
+// (63) (37)
+// (01 means key suffix)
+//
+// All the internal keys above (including kPlain and kPrefix) are encoded in
+// this format:
+// There are two types:
+// (1) normal internal key format
+// +----------- ...... -------------+----+---+---+---+---+---+---+---+
+// | user key |type| sequence ID |
+// +----------- ..... --------------+----+---+---+---+---+---+---+---+
+// (2) Special case for keys whose sequence ID is 0 and is value type
+// +----------- ...... -------------+----+
+// | user key |0x80|
+// +----------- ..... --------------+----+
+// To save 7 bytes for the special case where sequence ID = 0.
+//
+//
+class PlainTableFactory : public TableFactory {
+ public:
+ ~PlainTableFactory() {}
+ // user_key_len is the length of the user key. If it is set to be
+ // kPlainTableVariableLength, then it means variable length. Otherwise, all
+ // the keys need to have the fix length of this value. bloom_bits_per_key is
+ // number of bits used for bloom filer per key. hash_table_ratio is
+ // the desired utilization of the hash table used for prefix hashing.
+ // hash_table_ratio = number of prefixes / #buckets in the hash table
+ // hash_table_ratio = 0 means skip hash table but only replying on binary
+ // search.
+ // index_sparseness determines index interval for keys
+ // inside the same prefix. It will be the maximum number of linear search
+ // required after hash and binary search.
+ // index_sparseness = 0 means index for every key.
+ // huge_page_tlb_size determines whether to allocate hash indexes from huge
+ // page TLB and the page size if allocating from there. See comments of
+ // Arena::AllocateAligned() for details.
+ explicit PlainTableFactory(
+ const PlainTableOptions& _table_options = PlainTableOptions());
+
+ // Method to allow CheckedCast to work for this class
+ static const char* kClassName() { return kPlainTableName(); }
+ const char* Name() const override { return kPlainTableName(); }
+ using TableFactory::NewTableReader;
+ Status NewTableReader(const ReadOptions& ro,
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, std::unique_ptr<TableReader>* table,
+ bool prefetch_index_and_filter_in_cache) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const override;
+
+ std::string GetPrintableOptions() const override;
+ static const char kValueTypeSeqId0 = char(~0);
+
+ private:
+ PlainTableOptions table_options_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc
new file mode 100644
index 000000000..b7e07cfb2
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_index.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+ assert(num_buckets > 0);
+ return hash % num_buckets;
+}
+} // namespace
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+ if (!GetVarint32(&data, &index_size_)) {
+ return Status::Corruption("Couldn't read the index size!");
+ }
+ assert(index_size_ > 0);
+ if (!GetVarint32(&data, &num_prefixes_)) {
+ return Status::Corruption("Couldn't read the index size!");
+ }
+ sub_index_size_ =
+ static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+ char* index_data_begin = const_cast<char*>(data.data());
+ index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+ sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+ return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+ uint32_t prefix_hash, uint32_t* bucket_value) const {
+ int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+ GetUnaligned(index_ + bucket, bucket_value);
+ if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+ *bucket_value ^= kSubIndexMask;
+ return kSubindex;
+ }
+ if (*bucket_value >= kMaxFileSize) {
+ return kNoPrefixForBucket;
+ } else {
+ // point directly to the file
+ return kDirectToFile;
+ }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+ uint32_t offset) {
+ if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+ current_group_ = AllocateNewGroup();
+ num_records_in_current_group_ = 0;
+ }
+ auto& new_record = current_group_[num_records_in_current_group_++];
+ new_record.hash = hash;
+ new_record.offset = offset;
+ new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+ uint32_t key_offset) {
+ if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+ ++num_prefixes_;
+ if (!is_first_record_) {
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ }
+ num_keys_per_prefix_ = 0;
+ prev_key_prefix_ = key_prefix_slice.ToString();
+ prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+ due_index_ = true;
+ }
+
+ if (due_index_) {
+ // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+ record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+ due_index_ = false;
+ }
+
+ num_keys_per_prefix_++;
+ if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+ due_index_ = true;
+ }
+ is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+ AllocateIndex();
+ std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+ std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+ BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s",
+ keys_per_prefix_hist_.ToString().c_str());
+
+ // From the temp data structure, populate indexes.
+ return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+ if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+ // Fall back to pure binary search if the user fails to specify a prefix
+ // extractor.
+ index_size_ = 1;
+ } else {
+ double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+ index_size_ =
+ static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+ assert(index_size_ > 0);
+ }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+ std::vector<IndexRecord*>* hash_to_offsets,
+ std::vector<uint32_t>* entries_per_bucket) {
+ bool first = true;
+ uint32_t prev_hash = 0;
+ size_t num_records = record_list_.GetNumRecords();
+ for (size_t i = 0; i < num_records; i++) {
+ IndexRecord* index_record = record_list_.At(i);
+ uint32_t cur_hash = index_record->hash;
+ if (first || prev_hash != cur_hash) {
+ prev_hash = cur_hash;
+ first = false;
+ }
+ uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+ IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+ index_record->next = prev_bucket_head;
+ (*hash_to_offsets)[bucket] = index_record;
+ (*entries_per_bucket)[bucket]++;
+ }
+
+ sub_index_size_ = 0;
+ for (auto entry_count : *entries_per_bucket) {
+ if (entry_count <= 1) {
+ continue;
+ }
+ // Only buckets with more than 1 entry will have subindex.
+ sub_index_size_ += VarintLength(entry_count);
+ // total bytes needed to store these entries' in-file offsets.
+ sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+ }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+ const std::vector<IndexRecord*>& hash_to_offsets,
+ const std::vector<uint32_t>& entries_per_bucket) {
+ ROCKS_LOG_DEBUG(ioptions_.logger,
+ "Reserving %" PRIu32 " bytes for plain table's sub_index",
+ sub_index_size_);
+ auto total_allocate_size = GetTotalSize();
+ char* allocated = arena_->AllocateAligned(
+ total_allocate_size, huge_page_tlb_size_, ioptions_.logger);
+
+ auto temp_ptr = EncodeVarint32(allocated, index_size_);
+ uint32_t* index =
+ reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+ char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+ uint32_t sub_index_offset = 0;
+ for (uint32_t i = 0; i < index_size_; i++) {
+ uint32_t num_keys_for_bucket = entries_per_bucket[i];
+ switch (num_keys_for_bucket) {
+ case 0:
+ // No key for bucket
+ PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+ break;
+ case 1:
+ // point directly to the file offset
+ PutUnaligned(index + i, hash_to_offsets[i]->offset);
+ break;
+ default:
+ // point to second level indexes.
+ PutUnaligned(index + i,
+ sub_index_offset | PlainTableIndex::kSubIndexMask);
+ char* prev_ptr = &sub_index[sub_index_offset];
+ char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+ sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+ char* sub_index_pos = &sub_index[sub_index_offset];
+ IndexRecord* record = hash_to_offsets[i];
+ int j;
+ for (j = num_keys_for_bucket - 1; j >= 0 && record;
+ j--, record = record->next) {
+ EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+ }
+ assert(j == -1 && record == nullptr);
+ sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+ assert(sub_index_offset <= sub_index_size_);
+ break;
+ }
+ }
+ assert(sub_index_offset == sub_index_size_);
+
+ ROCKS_LOG_DEBUG(ioptions_.logger,
+ "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+ index_size_, sub_index_size_);
+ return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+ "PlainTableIndexBlock";
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h
new file mode 100644
index 000000000..9f5f0eeff
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.h
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains two classes PlainTableIndex and PlainTableIndexBuilder
+// The two classes implement the index format of PlainTable.
+// For description of PlainTable format, see comments of class
+// PlainTableFactory
+//
+//
+// PlainTableIndex contains buckets size of index_size_, each is a
+// 32-bit integer. The lower 31 bits contain an offset value (explained below)
+// and the first bit of the integer indicates type of the offset.
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+// hashing this prefix), whose first row starts from this offset of the
+// file.
+// 1 indicates that the bucket contains more than one prefixes, or there
+// are too many rows for one prefix so we need a binary search for it. In
+// this case, the offset indicates the offset of sub_index_ holding the
+// binary search indexes of keys for those rows. Those binary search indexes
+// are organized in this way:
+//
+// The first 4 bytes, indicate how many indexes (N) are stored after it. After
+// it, there are N 32-bit integers, each points of an offset of the file,
+// which
+// points to starting of a row. Those offsets need to be guaranteed to be in
+// ascending order so the keys they are pointing to are also in ascending
+// order
+// to make sure we can use them to do binary searches. Below is visual
+// presentation of a bucket.
+//
+// <begin>
+// number_of_records: varint32
+// record 1 file offset: fixedint32
+// record 2 file offset: fixedint32
+// ....
+// record N file offset: fixedint32
+// <end>
+
+// The class loads the index block from a PlainTable SST file, and executes
+// the index lookup.
+// The class is used by PlainTableReader class.
+class PlainTableIndex {
+ public:
+ enum IndexSearchResult {
+ kNoPrefixForBucket = 0,
+ kDirectToFile = 1,
+ kSubindex = 2
+ };
+
+ explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+ PlainTableIndex()
+ : index_size_(0),
+ sub_index_size_(0),
+ num_prefixes_(0),
+ index_(nullptr),
+ sub_index_(nullptr) {}
+
+ // The function that executes the lookup the hash table.
+ // The hash key is `prefix_hash`. The function fills the hash bucket
+ // content in `bucket_value`, which is up to the caller to interpret.
+ IndexSearchResult GetOffset(uint32_t prefix_hash,
+ uint32_t* bucket_value) const;
+
+ // Initialize data from `index_data`, which points to raw data for
+ // index stored in the SST file.
+ Status InitFromRawData(Slice index_data);
+
+ // Decode the sub index for specific hash bucket.
+ // The `offset` is the value returned as `bucket_value` by GetOffset()
+ // and is only valid when the return value is `kSubindex`.
+ // The return value is the pointer to the starting address of the
+ // sub-index. `upper_bound` is filled with the value indicating how many
+ // entries the sub-index has.
+ const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+ uint32_t* upper_bound) const {
+ const char* index_ptr = &sub_index_[offset];
+ return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+ }
+
+ uint32_t GetIndexSize() const { return index_size_; }
+
+ uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+ uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+ static const uint64_t kMaxFileSize = (1u << 31) - 1;
+ static const uint32_t kSubIndexMask = 0x80000000;
+ static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+ uint32_t index_size_;
+ uint32_t sub_index_size_;
+ uint32_t num_prefixes_;
+
+ uint32_t* index_;
+ char* sub_index_;
+};
+
+// PlainTableIndexBuilder is used to create plain table index.
+// After calling Finish(), it returns Slice, which is usually
+// used either to initialize PlainTableIndex or
+// to save index to sst file.
+// For more details about the index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+// The class is used by PlainTableBuilder class.
+class PlainTableIndexBuilder {
+ public:
+ PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
+ const SliceTransform* prefix_extractor,
+ size_t index_sparseness, double hash_table_ratio,
+ size_t huge_page_tlb_size)
+ : arena_(arena),
+ ioptions_(ioptions),
+ record_list_(kRecordsPerGroup),
+ is_first_record_(true),
+ due_index_(false),
+ num_prefixes_(0),
+ num_keys_per_prefix_(0),
+ prev_key_prefix_hash_(0),
+ index_sparseness_(index_sparseness),
+ index_size_(0),
+ sub_index_size_(0),
+ prefix_extractor_(prefix_extractor),
+ hash_table_ratio_(hash_table_ratio),
+ huge_page_tlb_size_(huge_page_tlb_size) {}
+
+ void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+ Slice Finish();
+
+ uint32_t GetTotalSize() const {
+ return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+ PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+ }
+
+ static const std::string kPlainTableIndexBlock;
+
+ private:
+ struct IndexRecord {
+ uint32_t hash; // hash of the prefix
+ uint32_t offset; // offset of a row
+ IndexRecord* next;
+ };
+
+ // Helper class to track all the index records
+ class IndexRecordList {
+ public:
+ explicit IndexRecordList(size_t num_records_per_group)
+ : kNumRecordsPerGroup(num_records_per_group),
+ current_group_(nullptr),
+ num_records_in_current_group_(num_records_per_group) {}
+
+ ~IndexRecordList() {
+ for (size_t i = 0; i < groups_.size(); i++) {
+ delete[] groups_[i];
+ }
+ }
+
+ void AddRecord(uint32_t hash, uint32_t offset);
+
+ size_t GetNumRecords() const {
+ return (groups_.size() - 1) * kNumRecordsPerGroup +
+ num_records_in_current_group_;
+ }
+ IndexRecord* At(size_t index) {
+ return &(
+ groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+ }
+
+ private:
+ IndexRecord* AllocateNewGroup() {
+ IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+ groups_.push_back(result);
+ return result;
+ }
+
+ // Each group in `groups_` contains fix-sized records (determined by
+ // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
+ // occurs.
+ const size_t kNumRecordsPerGroup;
+ IndexRecord* current_group_;
+ // List of arrays allocated
+ std::vector<IndexRecord*> groups_;
+ size_t num_records_in_current_group_;
+ };
+
+ void AllocateIndex();
+
+ // Internal helper function to bucket index record list to hash buckets.
+ void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+ std::vector<uint32_t>* entries_per_bucket);
+
+ // Internal helper class to fill the indexes and bloom filters to internal
+ // data structures.
+ Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+ const std::vector<uint32_t>& entries_per_bucket);
+
+ Arena* arena_;
+ const ImmutableOptions ioptions_;
+ HistogramImpl keys_per_prefix_hist_;
+ IndexRecordList record_list_;
+ bool is_first_record_;
+ bool due_index_;
+ uint32_t num_prefixes_;
+ uint32_t num_keys_per_prefix_;
+
+ uint32_t prev_key_prefix_hash_;
+ size_t index_sparseness_;
+ uint32_t index_size_;
+ uint32_t sub_index_size_;
+
+ const SliceTransform* prefix_extractor_;
+ double hash_table_ratio_;
+ size_t huge_page_tlb_size_;
+
+ std::string prev_key_prefix_;
+
+ static const size_t kRecordsPerGroup = 256;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc
new file mode 100644
index 000000000..800d8d76f
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.cc
@@ -0,0 +1,509 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_key_coding.h"
+
+#include <algorithm>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum PlainTableEntryType : unsigned char {
+ kFullKey = 0,
+ kPrefixFromPreviousKey = 1,
+ kKeySuffix = 2,
+};
+
+namespace {
+
+// Control byte:
+// First two bits indicate type of entry
+// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes
+// are used. key_size-0x3F will be encoded as a variint32 after this bytes.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+ char* out_buffer) {
+ out_buffer[0] = type << 6;
+
+ if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+ // size inlined
+ out_buffer[0] |= static_cast<char>(key_size);
+ return 1;
+ } else {
+ out_buffer[0] |= kSizeInlineLimit;
+ char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+ return ptr - out_buffer;
+ }
+}
+} // namespace
+
+// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+ PlainTableEntryType* entry_type,
+ uint32_t* key_size,
+ uint32_t* bytes_read) {
+ Slice next_byte_slice;
+ bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *entry_type = static_cast<PlainTableEntryType>(
+ (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+ 6);
+ char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
+ if (inline_key_size < kSizeInlineLimit) {
+ *key_size = inline_key_size;
+ *bytes_read = 1;
+ return Status::OK();
+ } else {
+ uint32_t extra_size;
+ uint32_t tmp_bytes_read;
+ success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+ &tmp_bytes_read);
+ if (!success) {
+ return file_reader_.status();
+ }
+ assert(tmp_bytes_read > 0);
+ *key_size = kSizeInlineLimit + extra_size;
+ *bytes_read = tmp_bytes_read + 1;
+ return Status::OK();
+ }
+}
+
+IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key,
+ WritableFileWriter* file,
+ uint64_t* offset, char* meta_bytes_buf,
+ size_t* meta_bytes_buf_size) {
+ ParsedInternalKey parsed_key;
+ Status pik_status =
+ ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO
+ if (!pik_status.ok()) {
+ return IOStatus::Corruption(pik_status.getState());
+ }
+
+ Slice key_to_write = key; // Portion of internal key to write out.
+
+ uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+ if (encoding_type_ == kPlain) {
+ if (fixed_user_key_len_ == kPlainTableVariableLength) {
+ // Write key length
+ char key_size_buf[5]; // tmp buffer for key size as varint32
+ char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+ assert(ptr <= key_size_buf + sizeof(key_size_buf));
+ auto len = ptr - key_size_buf;
+ IOStatus io_s = file->Append(Slice(key_size_buf, len));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ *offset += len;
+ }
+ } else {
+ assert(encoding_type_ == kPrefix);
+ char size_bytes[12];
+ size_t size_bytes_pos = 0;
+
+ Slice prefix =
+ prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+ if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() ||
+ key_count_for_prefix_ % index_sparseness_ == 0) {
+ key_count_for_prefix_ = 1;
+ pre_prefix_.SetUserKey(prefix);
+ size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+ IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ *offset += size_bytes_pos;
+ } else {
+ key_count_for_prefix_++;
+ if (key_count_for_prefix_ == 2) {
+ // For second key within a prefix, need to encode prefix length
+ size_bytes_pos +=
+ EncodeSize(kPrefixFromPreviousKey,
+ static_cast<uint32_t>(pre_prefix_.GetUserKey().size()),
+ size_bytes + size_bytes_pos);
+ }
+ uint32_t prefix_len =
+ static_cast<uint32_t>(pre_prefix_.GetUserKey().size());
+ size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+ size_bytes + size_bytes_pos);
+ IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ *offset += size_bytes_pos;
+ key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+ }
+ }
+
+ // Encode full key
+ // For value size as varint32 (up to 5 bytes).
+ // If the row is of value type with seqId 0, flush the special flag together
+ // in this buffer to safe one file append call, which takes 1 byte.
+ if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+ IOStatus io_s =
+ file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ *offset += key_to_write.size() - 8;
+ meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+ *meta_bytes_buf_size += 1;
+ } else {
+ IOStatus io_s = file->Append(key_to_write);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ *offset += key_to_write.size();
+ }
+
+ return IOStatus::OK();
+}
+
+Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
+ uint32_t len) {
+ assert(file_offset + len <= file_info_->data_end_offset);
+ return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
+ len);
+}
+
+bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
+ Slice* out) {
+ const uint32_t kPrefetchSize = 256u;
+
+ // Try to read from buffers.
+ for (uint32_t i = 0; i < num_buf_; i++) {
+ Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
+ if (file_offset >= buffer->buf_start_offset &&
+ file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
+ *out = GetFromBuffer(buffer, file_offset, len);
+ return true;
+ }
+ }
+
+ Buffer* new_buffer;
+ // Data needed is not in any of the buffer. Allocate a new buffer.
+ if (num_buf_ < buffers_.size()) {
+ // Add a new buffer
+ new_buffer = new Buffer();
+ buffers_[num_buf_++].reset(new_buffer);
+ } else {
+ // Now simply replace the last buffer. Can improve the placement policy
+ // if needed.
+ new_buffer = buffers_[num_buf_ - 1].get();
+ }
+
+ assert(file_offset + len <= file_info_->data_end_offset);
+ uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+ std::max(kPrefetchSize, len));
+ if (size_to_read > new_buffer->buf_capacity) {
+ new_buffer->buf.reset(new char[size_to_read]);
+ new_buffer->buf_capacity = size_to_read;
+ new_buffer->buf_len = 0;
+ }
+ Slice read_result;
+ // TODO: rate limit plain table reads.
+ Status s =
+ file_info_->file->Read(IOOptions(), file_offset, size_to_read,
+ &read_result, new_buffer->buf.get(), nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ status_ = s;
+ return false;
+ }
+ new_buffer->buf_start_offset = file_offset;
+ new_buffer->buf_len = size_to_read;
+ *out = GetFromBuffer(new_buffer, file_offset, len);
+ return true;
+}
+
+inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
+ uint32_t* bytes_read) {
+ if (file_info_->is_mmap_mode) {
+ const char* start = file_info_->file_data.data() + offset;
+ const char* limit =
+ file_info_->file_data.data() + file_info_->data_end_offset;
+ const char* key_ptr = GetVarint32Ptr(start, limit, out);
+ assert(key_ptr != nullptr);
+ *bytes_read = static_cast<uint32_t>(key_ptr - start);
+ return true;
+ } else {
+ return ReadVarint32NonMmap(offset, out, bytes_read);
+ }
+}
+
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
+ uint32_t* bytes_read) {
+ const char* start;
+ const char* limit;
+ const uint32_t kMaxVarInt32Size = 6u;
+ uint32_t bytes_to_read =
+ std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+ Slice bytes;
+ if (!Read(offset, bytes_to_read, &bytes)) {
+ return false;
+ }
+ start = bytes.data();
+ limit = bytes.data() + bytes.size();
+
+ const char* key_ptr = GetVarint32Ptr(start, limit, out);
+ *bytes_read =
+ (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+ return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+ uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+ uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+ Slice tmp_slice;
+ bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+ if (!success) {
+ return file_reader_.status();
+ }
+ if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
+ // Special encoding for the row with seqID=0
+ parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
+ parsed_key->sequence = 0;
+ parsed_key->type = kTypeValue;
+ *bytes_read += user_key_size + 1;
+ *internal_key_valid = false;
+ } else {
+ success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *internal_key_valid = true;
+ Status pik_status = ParseInternalKey(*internal_key, parsed_key,
+ false /* log_err_key */); // TODO
+ if (!pik_status.ok()) {
+ return Status::Corruption(
+ Slice("Corrupted key found during next key read. "),
+ pik_status.getState());
+ }
+ *bytes_read += user_key_size + 8;
+ }
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key,
+ uint32_t* bytes_read,
+ bool* /*seekable*/) {
+ uint32_t user_key_size = 0;
+ Status s;
+ if (fixed_user_key_len_ != kPlainTableVariableLength) {
+ user_key_size = fixed_user_key_len_;
+ } else {
+ uint32_t tmp_size = 0;
+ uint32_t tmp_read;
+ bool success =
+ file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+ if (!success) {
+ return file_reader_.status();
+ }
+ assert(tmp_read > 0);
+ user_key_size = tmp_size;
+ *bytes_read = tmp_read;
+ }
+ // dummy initial value to avoid compiler complain
+ bool decoded_internal_key_valid = true;
+ Slice decoded_internal_key;
+ s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &decoded_internal_key);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode) {
+ cur_key_.SetInternalKey(*parsed_key);
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), user_key_size);
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ } else if (internal_key != nullptr) {
+ if (decoded_internal_key_valid) {
+ *internal_key = decoded_internal_key;
+ } else {
+ // Need to copy out the internal key
+ cur_key_.SetInternalKey(*parsed_key);
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ }
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+ uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+ uint32_t* bytes_read, bool* seekable) {
+ PlainTableEntryType entry_type;
+
+ bool expect_suffix = false;
+ Status s;
+ do {
+ uint32_t size = 0;
+ // dummy initial value to avoid compiler complain
+ bool decoded_internal_key_valid = true;
+ uint32_t my_bytes_read = 0;
+ s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+ &my_bytes_read);
+ if (!s.ok()) {
+ return s;
+ }
+ if (my_bytes_read == 0) {
+ return Status::Corruption("Unexpected EOF when reading size of the key");
+ }
+ *bytes_read += my_bytes_read;
+
+ switch (entry_type) {
+ case kFullKey: {
+ expect_suffix = false;
+ Slice decoded_internal_key;
+ s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &decoded_internal_key);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode ||
+ (internal_key != nullptr && !decoded_internal_key_valid)) {
+ // In non-mmap mode, always need to make a copy of keys returned to
+ // users, because after reading value for the key, the key might
+ // be invalid.
+ cur_key_.SetInternalKey(*parsed_key);
+ saved_user_key_ = cur_key_.GetUserKey();
+ if (!file_reader_.file_info()->is_mmap_mode) {
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), size);
+ }
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ } else {
+ if (internal_key != nullptr) {
+ *internal_key = decoded_internal_key;
+ }
+ saved_user_key_ = parsed_key->user_key;
+ }
+ break;
+ }
+ case kPrefixFromPreviousKey: {
+ if (seekable != nullptr) {
+ *seekable = false;
+ }
+ prefix_len_ = size;
+ assert(prefix_extractor_ == nullptr ||
+ prefix_extractor_->Transform(saved_user_key_).size() ==
+ prefix_len_);
+ // Need read another size flag for suffix
+ expect_suffix = true;
+ break;
+ }
+ case kKeySuffix: {
+ expect_suffix = false;
+ if (seekable != nullptr) {
+ *seekable = false;
+ }
+
+ Slice tmp_slice;
+ s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &tmp_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode) {
+ // In non-mmap mode, we need to make a copy of keys returned to
+ // users, because after reading value for the key, the key might
+ // be invalid.
+ // saved_user_key_ points to cur_key_. We are making a copy of
+ // the prefix part to another string, and construct the current
+ // key from the prefix part and the suffix part back to cur_key_.
+ std::string tmp =
+ Slice(saved_user_key_.data(), prefix_len_).ToString();
+ cur_key_.Reserve(prefix_len_ + size);
+ cur_key_.SetInternalKey(tmp, *parsed_key);
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size);
+ saved_user_key_ = cur_key_.GetUserKey();
+ } else {
+ cur_key_.Reserve(prefix_len_ + size);
+ cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+ *parsed_key);
+ }
+ parsed_key->user_key = cur_key_.GetUserKey();
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ break;
+ }
+ default:
+ return Status::Corruption("Un-identified size flag.");
+ }
+ } while (expect_suffix); // Another round if suffix is expected.
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value,
+ uint32_t* bytes_read, bool* seekable) {
+ assert(value != nullptr);
+ Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+ seekable);
+ if (s.ok()) {
+ assert(bytes_read != nullptr);
+ uint32_t value_size;
+ uint32_t value_size_bytes;
+ bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+ &value_size, &value_size_bytes);
+ if (!success) {
+ return file_reader_.status();
+ }
+ if (value_size_bytes == 0) {
+ return Status::Corruption(
+ "Unexpected EOF when reading the next value's size.");
+ }
+ *bytes_read += value_size_bytes;
+ success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *bytes_read += value_size;
+ }
+ return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key,
+ uint32_t* bytes_read,
+ bool* seekable) {
+ *bytes_read = 0;
+ if (seekable != nullptr) {
+ *seekable = true;
+ }
+ if (encoding_type_ == kPlain) {
+ return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
+ bytes_read, seekable);
+ } else {
+ assert(encoding_type_ == kPrefix);
+ return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
+ bytes_read, seekable);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LIT
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h
new file mode 100644
index 000000000..9cda7df32
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <array>
+
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// The file contains three helper classes of PlainTable format,
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes issue the lowest level of operations of PlainTable.
+// Actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper class for PlainTable format to write out a key to an output file
+// The class is used in PlainTableBuilder.
+class PlainTableKeyEncoder {
+ public:
+ explicit PlainTableKeyEncoder(EncodingType encoding_type,
+ uint32_t user_key_len,
+ const SliceTransform* prefix_extractor,
+ size_t index_sparseness)
+ : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+ fixed_user_key_len_(user_key_len),
+ prefix_extractor_(prefix_extractor),
+ index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+ key_count_for_prefix_(0) {}
+ // key: the key to write out, in the format of internal key.
+ // file: the output file to write out
+ // offset: offset in the file. Needs to be updated after appending bytes
+ // for the key
+ // meta_bytes_buf: buffer for extra meta bytes
+ // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+ // if meta_bytes_buf is updated.
+ IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
+ uint64_t* offset, char* meta_bytes_buf,
+ size_t* meta_bytes_buf_size);
+
+ // Return actual encoding type to be picked
+ EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+ EncodingType encoding_type_;
+ uint32_t fixed_user_key_len_;
+ const SliceTransform* prefix_extractor_;
+ const size_t index_sparseness_;
+ size_t key_count_for_prefix_;
+ IterKey pre_prefix_;
+};
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether it is a mmap-read, or a non-mmap read.
+// The class is implemented in a way to favor the performance of mmap case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+ explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+ : file_info_(_file_info), num_buf_(0) {}
+
+ ~PlainTableFileReader() {
+ // Should fix.
+ status_.PermitUncheckedError();
+ }
+
+ // In mmaped mode, the results point to mmaped area of the file, which
+ // means it is always valid before closing the file.
+ // In non-mmap mode, the results point to an internal buffer. If the caller
+ // makes another read call, the results may not be valid. So callers should
+ // make a copy when needed.
+ // In order to save read calls to files, we keep two internal buffers:
+ // the first read and the most recent read. This is efficient because it
+ // columns these two common use cases:
+ // (1) hash index only identify one location, we read the key to verify
+ // the location, and read key and value if it is the right location.
+ // (2) after hash index checking, we identify two locations (because of
+ // hash bucket conflicts), we binary search the two location to see
+ // which one is what we need and start to read from the location.
+ // These two most common use cases will be covered by the two buffers
+ // so that we don't need to re-read the same location.
+ // Currently we keep a fixed size buffer. If a read doesn't exactly fit
+ // the buffer, we replace the second buffer with the location user reads.
+ //
+ // If return false, status code is stored in status_.
+ bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+ if (file_info_->is_mmap_mode) {
+ assert(file_offset + len <= file_info_->data_end_offset);
+ *out = Slice(file_info_->file_data.data() + file_offset, len);
+ return true;
+ } else {
+ return ReadNonMmap(file_offset, len, out);
+ }
+ }
+
+ // If return false, status code is stored in status_.
+ bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+ // *bytes_read = 0 means eof. false means failure and status is saved
+ // in status_. Not directly returning Status to save copying status
+ // object to map previous performance of mmap mode.
+ inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+ uint32_t* bytes_read);
+
+ bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+ uint32_t* bytes_read);
+
+ Status status() const { return status_; }
+
+ const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+ const PlainTableReaderFileInfo* file_info_;
+
+ struct Buffer {
+ Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+ std::unique_ptr<char[]> buf;
+ uint32_t buf_start_offset;
+ uint32_t buf_len;
+ uint32_t buf_capacity;
+ };
+
+ // Keep buffers for two recent reads.
+ std::array<std::unique_ptr<Buffer>, 2> buffers_;
+ uint32_t num_buf_;
+ Status status_;
+
+ Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys from input buffer
+// The class is used by PlainTableBuilder.
+class PlainTableKeyDecoder {
+ public:
+ explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+ EncodingType encoding_type,
+ uint32_t user_key_len,
+ const SliceTransform* prefix_extractor)
+ : file_reader_(file_info),
+ encoding_type_(encoding_type),
+ prefix_len_(0),
+ fixed_user_key_len_(user_key_len),
+ prefix_extractor_(prefix_extractor),
+ in_prefix_(false) {}
+
+ // Find the next key.
+ // start: char array where the key starts.
+ // limit: boundary of the char array
+ // parsed_key: the output of the result key
+ // internal_key: if not null, fill with the output of the result key in
+ // un-parsed format
+ // bytes_read: how many bytes read from start. Output
+ // seekable: whether key can be read from this place. Used when building
+ // indexes. Output.
+ Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+
+ Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+
+ PlainTableFileReader file_reader_;
+ EncodingType encoding_type_;
+ uint32_t prefix_len_;
+ uint32_t fixed_user_key_len_;
+ Slice saved_user_key_;
+ IterKey cur_key_;
+ const SliceTransform* prefix_extractor_;
+ bool in_prefix_;
+
+ private:
+ Status NextPlainEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+ Status NextPrefixEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+ Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+ ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+ bool* internal_key_valid, Slice* internal_key);
+ inline Status DecodeSize(uint32_t start_offset,
+ PlainTableEntryType* entry_type, uint32_t* key_size,
+ uint32_t* bytes_read);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc
new file mode 100644
index 000000000..6ce3d0ab9
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.cc
@@ -0,0 +1,765 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "table/plain/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Safely getting a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as an fixed 32 bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+ return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+} // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public InternalIterator {
+ public:
+ explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+ // No copying allowed
+ PlainTableIterator(const PlainTableIterator&) = delete;
+ void operator=(const Iterator&) = delete;
+
+ ~PlainTableIterator() override;
+
+ bool Valid() const override;
+
+ void SeekToFirst() override;
+
+ void SeekToLast() override;
+
+ void Seek(const Slice& target) override;
+
+ void SeekForPrev(const Slice& target) override;
+
+ void Next() override;
+
+ void Prev() override;
+
+ Slice key() const override;
+
+ Slice value() const override;
+
+ Status status() const override;
+
+ private:
+ PlainTableReader* table_;
+ PlainTableKeyDecoder decoder_;
+ bool use_prefix_seek_;
+ uint32_t offset_;
+ uint32_t next_offset_;
+ Slice key_;
+ Slice value_;
+ Status status_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+PlainTableReader::PlainTableReader(
+ const ImmutableOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size,
+ const TableProperties* table_properties,
+ const SliceTransform* prefix_extractor)
+ : internal_comparator_(icomparator),
+ encoding_type_(encoding_type),
+ full_scan_mode_(false),
+ user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+ prefix_extractor_(prefix_extractor),
+ enable_bloom_(false),
+ bloom_(6),
+ file_info_(std::move(file), storage_options,
+ static_cast<uint32_t>(table_properties->data_size)),
+ ioptions_(ioptions),
+ file_size_(file_size),
+ table_properties_(nullptr) {}
+
+PlainTableReader::~PlainTableReader() {
+ // Should fix?
+ status_.PermitUncheckedError();
+}
+
+Status PlainTableReader::Open(
+ const ImmutableOptions& ioptions, const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+ bool full_scan_mode, const bool immortal_table,
+ const SliceTransform* prefix_extractor) {
+ if (file_size > PlainTableIndex::kMaxFileSize) {
+ return Status::NotSupported("File is too large for PlainTableReader!");
+ }
+
+ std::unique_ptr<TableProperties> props;
+ auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ ioptions, &props);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(hash_table_ratio >= 0.0);
+ auto& user_props = props->user_collected_properties;
+ auto prefix_extractor_in_file = props->prefix_extractor_name;
+
+ if (!full_scan_mode &&
+ !prefix_extractor_in_file.empty() /* old version sst file*/
+ && prefix_extractor_in_file != "nullptr") {
+ if (!prefix_extractor) {
+ return Status::InvalidArgument(
+ "Prefix extractor is missing when opening a PlainTable built "
+ "using a prefix extractor");
+ } else if (prefix_extractor_in_file != prefix_extractor->AsString()) {
+ return Status::InvalidArgument(
+ "Prefix extractor given doesn't match the one used to build "
+ "PlainTable");
+ }
+ }
+
+ EncodingType encoding_type = kPlain;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ if (encoding_type_prop != user_props.end()) {
+ encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+ }
+
+ std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+ ioptions, std::move(file), env_options, internal_comparator,
+ encoding_type, file_size, props.get(), prefix_extractor));
+
+ s = new_reader->MmapDataIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!full_scan_mode) {
+ s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
+ hash_table_ratio, index_sparseness,
+ huge_page_tlb_size);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ // Flag to indicate it is a full scan mode so that none of the indexes
+ // can be used.
+ new_reader->full_scan_mode_ = true;
+ }
+ // PopulateIndex can add to the props, so don't store them until now
+ new_reader->table_properties_ = std::move(props);
+
+ if (immortal_table && new_reader->file_info_.is_mmap_mode) {
+ new_reader->dummy_cleanable_.reset(new Cleanable());
+ }
+
+ *table_reader = std::move(new_reader);
+ return s;
+}
+
+void PlainTableReader::SetupForCompaction() {}
+
+InternalIterator* PlainTableReader::NewIterator(
+ const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
+ Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+ size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+ // Not necessarily used here, but make sure this has been initialized
+ assert(table_properties_);
+
+ // Auto prefix mode is not implemented in PlainTable.
+ bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+ !options.auto_prefix_mode;
+ if (arena == nullptr) {
+ return new PlainTableIterator(this, use_prefix_seek);
+ } else {
+ auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+ return new (mem) PlainTableIterator(this, use_prefix_seek);
+ }
+}
+
+Status PlainTableReader::PopulateIndexRecordList(
+ PlainTableIndexBuilder* index_builder,
+ std::vector<uint32_t>* prefix_hashes) {
+ Slice prev_key_prefix_slice;
+ std::string prev_key_prefix_buf;
+ uint32_t pos = data_start_offset_;
+
+ bool is_first_record = true;
+ Slice key_prefix_slice;
+ PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+ prefix_extractor_);
+ while (pos < file_info_.data_end_offset) {
+ uint32_t key_offset = pos;
+ ParsedInternalKey key;
+ Slice value_slice;
+ bool seekable = false;
+ Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
+ if (!s.ok()) {
+ return s;
+ }
+
+ key_prefix_slice = GetPrefix(key);
+ if (enable_bloom_) {
+ bloom_.AddHash(GetSliceHash(key.user_key));
+ } else {
+ if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+ if (!is_first_record) {
+ prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+ }
+ if (file_info_.is_mmap_mode) {
+ prev_key_prefix_slice = key_prefix_slice;
+ } else {
+ prev_key_prefix_buf = key_prefix_slice.ToString();
+ prev_key_prefix_slice = prev_key_prefix_buf;
+ }
+ }
+ }
+
+ index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+ if (!seekable && is_first_record) {
+ return Status::Corruption("Key for a prefix is not seekable");
+ }
+
+ is_first_record = false;
+ }
+
+ prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+ auto s = index_.InitFromRawData(index_builder->Finish());
+ return s;
+}
+
+void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
+ size_t huge_page_tlb_size) {
+ uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
+ if (bloom_total_bits > 0) {
+ enable_bloom_ = true;
+ bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+ huge_page_tlb_size, ioptions_.logger);
+ }
+}
+
+void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
+ assert(bloom_.IsInitialized());
+ for (const auto prefix_hash : prefix_hashes) {
+ bloom_.AddHash(prefix_hash);
+ }
+}
+
+Status PlainTableReader::MmapDataIfNeeded() {
+ if (file_info_.is_mmap_mode) {
+ // Get mmapped memory.
+ return file_info_.file->Read(
+ IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
+ nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ }
+ return Status::OK();
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+ int bloom_bits_per_key,
+ double hash_table_ratio,
+ size_t index_sparseness,
+ size_t huge_page_tlb_size) {
+ assert(props != nullptr);
+
+ BlockContents index_block_contents;
+ Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+ file_size_, kPlainTableMagicNumber, ioptions_,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ BlockType::kIndex, &index_block_contents);
+
+ bool index_in_file = s.ok();
+
+ BlockContents bloom_block_contents;
+ bool bloom_in_file = false;
+ // We only need to read the bloom block if index block is in file.
+ if (index_in_file) {
+ s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+ file_size_, kPlainTableMagicNumber, ioptions_,
+ BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
+ &bloom_block_contents);
+ bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
+ }
+
+ Slice* bloom_block;
+ if (bloom_in_file) {
+ // If bloom_block_contents.allocation is not empty (which will be the case
+ // for non-mmap mode), it holds the alloated memory for the bloom block.
+ // It needs to be kept alive to keep `bloom_block` valid.
+ bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
+ bloom_block = &bloom_block_contents.data;
+ } else {
+ bloom_block = nullptr;
+ }
+
+ Slice* index_block;
+ if (index_in_file) {
+ // If index_block_contents.allocation is not empty (which will be the case
+ // for non-mmap mode), it holds the alloated memory for the index block.
+ // It needs to be kept alive to keep `index_block` valid.
+ index_block_alloc_ = std::move(index_block_contents.allocation);
+ index_block = &index_block_contents.data;
+ } else {
+ index_block = nullptr;
+ }
+
+ if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
+ // moptions.prefix_extractor is requried for a hash-based look-up.
+ return Status::NotSupported(
+ "PlainTable requires a prefix extractor enable prefix hash mode.");
+ }
+
+ // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
+ // for a prefix (starting from the first one), generate a record of (hash,
+ // offset) and append it to IndexRecordList, which is a data structure created
+ // to store them.
+
+ if (!index_in_file) {
+ // Allocate bloom filter here for total order mode.
+ if (IsTotalOrderMode()) {
+ AllocateBloom(bloom_bits_per_key,
+ static_cast<uint32_t>(props->num_entries),
+ huge_page_tlb_size);
+ }
+ } else if (bloom_in_file) {
+ enable_bloom_ = true;
+ auto num_blocks_property = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+
+ uint32_t num_blocks = 0;
+ if (num_blocks_property != props->user_collected_properties.end()) {
+ Slice temp_slice(num_blocks_property->second);
+ if (!GetVarint32(&temp_slice, &num_blocks)) {
+ num_blocks = 0;
+ }
+ }
+ // cast away const qualifier, because bloom_ won't be changed
+ bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
+ static_cast<uint32_t>(bloom_block->size()) * 8,
+ num_blocks);
+ } else {
+ // Index in file but no bloom in file. Disable bloom filter in this case.
+ enable_bloom_ = false;
+ bloom_bits_per_key = 0;
+ }
+
+ PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
+ index_sparseness, hash_table_ratio,
+ huge_page_tlb_size);
+
+ std::vector<uint32_t> prefix_hashes;
+ if (!index_in_file) {
+ // Populates _bloom if enabled (total order mode)
+ s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ s = index_.InitFromRawData(*index_block);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!index_in_file) {
+ if (!IsTotalOrderMode()) {
+ // Calculated bloom filter size and allocate memory for
+ // bloom filter based on the number of prefixes, then fill it.
+ AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+ huge_page_tlb_size);
+ if (enable_bloom_) {
+ FillBloom(prefix_hashes);
+ }
+ }
+ }
+
+ // Fill two table properties.
+ if (!index_in_file) {
+ props->user_collected_properties["plain_table_hash_table_size"] =
+ std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+ props->user_collected_properties["plain_table_sub_index_size"] =
+ std::to_string(index_.GetSubIndexSize());
+ } else {
+ props->user_collected_properties["plain_table_hash_table_size"] =
+ std::to_string(0);
+ props->user_collected_properties["plain_table_sub_index_size"] =
+ std::to_string(0);
+ }
+
+ return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
+ const Slice& target, const Slice& prefix,
+ uint32_t prefix_hash, bool& prefix_matched,
+ uint32_t* offset) const {
+ prefix_matched = false;
+ uint32_t prefix_index_offset;
+ auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+ if (res == PlainTableIndex::kNoPrefixForBucket) {
+ *offset = file_info_.data_end_offset;
+ return Status::OK();
+ } else if (res == PlainTableIndex::kDirectToFile) {
+ *offset = prefix_index_offset;
+ return Status::OK();
+ }
+
+ // point to sub-index, need to do a binary search
+ uint32_t upper_bound = 0;
+ const char* base_ptr =
+ index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
+ uint32_t low = 0;
+ uint32_t high = upper_bound;
+ ParsedInternalKey mid_key;
+ ParsedInternalKey parsed_target;
+ Status s = ParseInternalKey(target, &parsed_target,
+ false /* log_err_key */); // TODO
+ if (!s.ok()) return s;
+
+ // The key is between [low, high). Do a binary search between it.
+ while (high - low > 1) {
+ uint32_t mid = (high + low) / 2;
+ uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+ uint32_t tmp;
+ s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
+ if (!s.ok()) {
+ return s;
+ }
+ int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+ if (cmp_result < 0) {
+ low = mid;
+ } else {
+ if (cmp_result == 0) {
+ // Happen to have found the exact key or target is smaller than the
+ // first key after base_offset.
+ prefix_matched = true;
+ *offset = file_offset;
+ return Status::OK();
+ } else {
+ high = mid;
+ }
+ }
+ }
+ // Both of the key at the position low or low+1 could share the same
+ // prefix as target. We need to rule out one of them to avoid to go
+ // to the wrong prefix.
+ ParsedInternalKey low_key;
+ uint32_t tmp;
+ uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+ s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (GetPrefix(low_key) == prefix) {
+ prefix_matched = true;
+ *offset = low_key_offset;
+ } else if (low + 1 < upper_bound) {
+ // There is possible a next prefix, return it
+ prefix_matched = false;
+ *offset = GetFixed32Element(base_ptr, low + 1);
+ } else {
+ // target is larger than a key of the last prefix in this bucket
+ // but with a different prefix. Key does not exist.
+ *offset = file_info_.data_end_offset;
+ }
+ return Status::OK();
+}
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+ if (!enable_bloom_) {
+ return true;
+ }
+
+ if (bloom_.MayContainHash(hash)) {
+ PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ return true;
+ } else {
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ return false;
+ }
+}
+
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value,
+ bool* seekable) const {
+ if (*offset == file_info_.data_end_offset) {
+ *offset = file_info_.data_end_offset;
+ return Status::OK();
+ }
+
+ if (*offset > file_info_.data_end_offset) {
+ return Status::Corruption("Offset is out of file size");
+ }
+
+ uint32_t bytes_read;
+ Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+ &bytes_read, seekable);
+ if (!s.ok()) {
+ return s;
+ }
+ *offset = *offset + bytes_read;
+ return Status::OK();
+}
+
+void PlainTableReader::Prepare(const Slice& target) {
+ if (enable_bloom_) {
+ uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+ bloom_.Prefetch(prefix_hash);
+ }
+}
+
+Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
+ GetContext* get_context,
+ const SliceTransform* /* prefix_extractor */,
+ bool /*skip_filters*/) {
+ // Check bloom filter first.
+ Slice prefix_slice;
+ uint32_t prefix_hash;
+ if (IsTotalOrderMode()) {
+ if (full_scan_mode_) {
+ status_ =
+ Status::InvalidArgument("Get() is not allowed in full scan mode.");
+ }
+ // Match whole user key for bloom filter check.
+ if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) {
+ return Status::OK();
+ }
+ // in total order mode, there is only one bucket 0, and we always use empty
+ // prefix.
+ prefix_slice = Slice();
+ prefix_hash = 0;
+ } else {
+ prefix_slice = GetPrefix(target);
+ prefix_hash = GetSliceHash(prefix_slice);
+ if (!MatchBloom(prefix_hash)) {
+ return Status::OK();
+ }
+ }
+ uint32_t offset;
+ bool prefix_match;
+ PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+ prefix_extractor_);
+ Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
+ prefix_match, &offset);
+
+ if (!s.ok()) {
+ return s;
+ }
+ ParsedInternalKey found_key;
+ ParsedInternalKey parsed_target;
+ s = ParseInternalKey(target, &parsed_target,
+ false /* log_err_key */); // TODO
+ if (!s.ok()) return s;
+
+ Slice found_value;
+ while (offset < file_info_.data_end_offset) {
+ s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!prefix_match) {
+ // Need to verify prefix for the first key found if it is not yet
+ // checked.
+ if (GetPrefix(found_key) != prefix_slice) {
+ return Status::OK();
+ }
+ prefix_match = true;
+ }
+ // TODO(ljin): since we know the key comparison result here,
+ // can we enable the fast path?
+ if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+ bool dont_care __attribute__((__unused__));
+ if (!get_context->SaveValue(found_key, found_value, &dont_care,
+ dummy_cleanable_.get())) {
+ break;
+ }
+ }
+ }
+ return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
+ TableReaderCaller /*caller*/) {
+ return 0;
+}
+
+uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
+ const Slice& /*end*/,
+ TableReaderCaller /*caller*/) {
+ return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+ bool use_prefix_seek)
+ : table_(table),
+ decoder_(&table_->file_info_, table_->encoding_type_,
+ table_->user_key_len_, table_->prefix_extractor_),
+ use_prefix_seek_(use_prefix_seek) {
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+PlainTableIterator::~PlainTableIterator() {}
+
+bool PlainTableIterator::Valid() const {
+ return offset_ < table_->file_info_.data_end_offset &&
+ offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+ status_ = Status::OK();
+ next_offset_ = table_->data_start_offset_;
+ if (next_offset_ >= table_->file_info_.data_end_offset) {
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+ } else {
+ Next();
+ }
+}
+
+void PlainTableIterator::SeekToLast() {
+ assert(false);
+ status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+ if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+ // This check is done here instead of NewIterator() to permit creating an
+ // iterator with total_order_seek = true even if we won't be able to Seek()
+ // it. This is needed for compaction: it creates iterator with
+ // total_order_seek = true but usually never does Seek() on it,
+ // only SeekToFirst().
+ status_ = Status::InvalidArgument(
+ "total_order_seek not implemented for PlainTable.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+
+ // If the user doesn't set prefix seek option and we are not able to do a
+ // total Seek(). assert failure.
+ if (table_->IsTotalOrderMode()) {
+ if (table_->full_scan_mode_) {
+ status_ =
+ Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ } else if (table_->GetIndexSize() > 1) {
+ assert(false);
+ status_ = Status::NotSupported(
+ "PlainTable cannot issue non-prefix seek unless in total order "
+ "mode.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+ }
+
+ Slice prefix_slice = table_->GetPrefix(target);
+ uint32_t prefix_hash = 0;
+ // Bloom filter is ignored in total-order mode.
+ if (!table_->IsTotalOrderMode()) {
+ prefix_hash = GetSliceHash(prefix_slice);
+ if (!table_->MatchBloom(prefix_hash)) {
+ status_ = Status::OK();
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+ }
+ bool prefix_match;
+ status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
+ prefix_match, &next_offset_);
+ if (!status_.ok()) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+
+ if (next_offset_ < table_->file_info_.data_end_offset) {
+ for (Next(); status_.ok() && Valid(); Next()) {
+ if (!prefix_match) {
+ // Need to verify the first key's prefix
+ if (table_->GetPrefix(key()) != prefix_slice) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ break;
+ }
+ prefix_match = true;
+ }
+ if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+ break;
+ }
+ }
+ } else {
+ offset_ = table_->file_info_.data_end_offset;
+ }
+}
+
+void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
+ assert(false);
+ status_ =
+ Status::NotSupported("SeekForPrev() is not supported in PlainTable");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Next() {
+ offset_ = next_offset_;
+ if (offset_ < table_->file_info_.data_end_offset) {
+ Slice tmp_slice;
+ ParsedInternalKey parsed_key;
+ status_ =
+ table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
+ if (!status_.ok()) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ }
+ }
+}
+
+void PlainTableIterator::Prev() { assert(false); }
+
+Slice PlainTableIterator::key() const {
+ assert(Valid());
+ return key_;
+}
+
+Slice PlainTableIterator::value() const {
+ assert(Valid());
+ return value_;
+}
+
+Status PlainTableIterator::status() const { return status_; }
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h
new file mode 100644
index 000000000..62bda693a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.h
@@ -0,0 +1,244 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Block;
+struct BlockContents;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
+
+extern const uint32_t kPlainTableVariableLength;
+
+struct PlainTableReaderFileInfo {
+ bool is_mmap_mode;
+ Slice file_data;
+ uint32_t data_end_offset;
+ std::unique_ptr<RandomAccessFileReader> file;
+
+ PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
+ const EnvOptions& storage_options,
+ uint32_t _data_size_offset)
+ : is_mmap_mode(storage_options.use_mmap_reads),
+ data_end_offset(_data_size_offset),
+ file(std::move(_file)) {}
+};
+
+// The reader class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableReader : public TableReader {
+ public:
+ // Based on following output file format shown in plain_table_factory.h
+ // When opening the output file, PlainTableReader creates a hash table
+ // from key prefixes to offset of the output file. PlainTable will decide
+ // whether it points to the data offset of the first key with the key prefix
+ // or the offset of it. If there are too many keys share this prefix, it will
+ // create a binary search-able index from the suffix to offset on disk.
+ static Status Open(const ImmutableOptions& ioptions,
+ const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, std::unique_ptr<TableReader>* table,
+ const int bloom_bits_per_key, double hash_table_ratio,
+ size_t index_sparseness, size_t huge_page_tlb_size,
+ bool full_scan_mode, const bool immortal_table = false,
+ const SliceTransform* prefix_extractor = nullptr);
+
+ // Returns new iterator over table contents
+ // compaction_readahead_size: its value will only be used if for_compaction =
+ // true
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0,
+ bool allow_unprepared_value = false) override;
+
+ void Prepare(const Slice& target) override;
+
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ uint64_t ApproximateOffsetOf(const Slice& key,
+ TableReaderCaller caller) override;
+
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) override;
+
+ uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+ void SetupForCompaction() override;
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override {
+ return table_properties_;
+ }
+
+ virtual size_t ApproximateMemoryUsage() const override {
+ return arena_.MemoryAllocatedBytes();
+ }
+
+ PlainTableReader(const ImmutableOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ EncodingType encoding_type, uint64_t file_size,
+ const TableProperties* table_properties,
+ const SliceTransform* prefix_extractor);
+ virtual ~PlainTableReader();
+
+ protected:
+ // Check bloom filter to see whether it might contain this prefix.
+ // The hash of the prefix is given, since it can be reused for index lookup
+ // too.
+ virtual bool MatchBloom(uint32_t hash) const;
+
+ // PopulateIndex() builds index of keys. It must be called before any query
+ // to the table.
+ //
+ // props: the table properties object that need to be stored. Ownership of
+ // the object will be passed.
+ //
+
+ Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness,
+ size_t huge_page_tlb_size);
+
+ Status MmapDataIfNeeded();
+
+ private:
+ const InternalKeyComparator internal_comparator_;
+ EncodingType encoding_type_;
+ // represents plain table's current status.
+ Status status_;
+
+ PlainTableIndex index_;
+ bool full_scan_mode_;
+
+ // data_start_offset_ and data_end_offset_ defines the range of the
+ // sst file that stores data.
+ const uint32_t data_start_offset_ = 0;
+ const uint32_t user_key_len_;
+ const SliceTransform* prefix_extractor_;
+
+ static const size_t kNumInternalBytes = 8;
+
+ // Bloom filter is used to rule out non-existent key
+ bool enable_bloom_;
+ PlainTableBloomV1 bloom_;
+ PlainTableReaderFileInfo file_info_;
+ Arena arena_;
+ CacheAllocationPtr index_block_alloc_;
+ CacheAllocationPtr bloom_block_alloc_;
+
+ const ImmutableOptions& ioptions_;
+ std::unique_ptr<Cleanable> dummy_cleanable_;
+ uint64_t file_size_;
+
+ protected: // for testing
+ std::shared_ptr<const TableProperties> table_properties_;
+
+ private:
+ bool IsFixedLength() const {
+ return user_key_len_ != kPlainTableVariableLength;
+ }
+
+ size_t GetFixedInternalKeyLength() const {
+ return user_key_len_ + kNumInternalBytes;
+ }
+
+ Slice GetPrefix(const Slice& target) const {
+ assert(target.size() >= 8); // target is internal key
+ return GetPrefixFromUserKey(ExtractUserKey(target));
+ }
+
+ Slice GetPrefix(const ParsedInternalKey& target) const {
+ return GetPrefixFromUserKey(target.user_key);
+ }
+
+ Slice GetPrefixFromUserKey(const Slice& user_key) const {
+ if (!IsTotalOrderMode()) {
+ return prefix_extractor_->Transform(user_key);
+ } else {
+ // Use empty slice as prefix if prefix_extractor is not set.
+ // In that case,
+ // it falls back to pure binary search and
+ // total iterator seek is supported.
+ return Slice();
+ }
+ }
+
+ friend class TableCache;
+ friend class PlainTableIterator;
+
+ // Internal helper function to generate an IndexRecordList object from all
+ // the rows, which contains index records as a list.
+ // If bloom_ is not null, all the keys' full-key hash will be added to the
+ // bloom filter.
+ Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+ std::vector<uint32_t>* prefix_hashes);
+
+ // Internal helper function to allocate memory for bloom filter
+ void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
+ size_t huge_page_tlb_size);
+
+ void FillBloom(const std::vector<uint32_t>& prefix_hashes);
+
+ // Read the key and value at `offset` to parameters for keys, the and
+ // `seekable`.
+ // On success, `offset` will be updated as the offset for the next key.
+ // `parsed_key` will be key in parsed format.
+ // if `internal_key` is not empty, it will be filled with key with slice
+ // format.
+ // if `seekable` is not null, it will return whether we can directly read
+ // data using this offset.
+ Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+ ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+ bool* seekable = nullptr) const;
+ // Get file offset for key target.
+ // return value prefix_matched is set to true if the offset is confirmed
+ // for a key with the same prefix as target.
+ Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
+ const Slice& prefix, uint32_t prefix_hash,
+ bool& prefix_matched, uint32_t* offset) const;
+
+ bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
+ // No copying allowed
+ explicit PlainTableReader(const TableReader&) = delete;
+ void operator=(const TableReader&) = delete;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE