Diffstat (limited to 'src/rocksdb/table')
-rw-r--r--  src/rocksdb/table/adaptive/adaptive_table_factory.cc  124
-rw-r--r--  src/rocksdb/table/adaptive/adaptive_table_factory.h  63
-rw-r--r--  src/rocksdb/table/block_based/block.cc  1004
-rw-r--r--  src/rocksdb/table/block_based/block.h  631
-rw-r--r--  src/rocksdb/table/block_based/block_based_filter_block.cc  347
-rw-r--r--  src/rocksdb/table/block_based/block_based_filter_block.h  119
-rw-r--r--  src/rocksdb/table/block_based/block_based_filter_block_test.cc  434
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_builder.cc  1217
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_builder.h  157
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_factory.cc  649
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_factory.h  195
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_reader.cc  4531
-rw-r--r--  src/rocksdb/table/block_based/block_based_table_reader.h  824
-rw-r--r--  src/rocksdb/table/block_based/block_builder.cc  196
-rw-r--r--  src/rocksdb/table/block_based/block_builder.h  75
-rw-r--r--  src/rocksdb/table/block_based/block_prefix_index.cc  232
-rw-r--r--  src/rocksdb/table/block_based/block_prefix_index.h  66
-rw-r--r--  src/rocksdb/table/block_based/block_test.cc  627
-rw-r--r--  src/rocksdb/table/block_based/block_type.h  30
-rw-r--r--  src/rocksdb/table/block_based/cachable_entry.h  220
-rw-r--r--  src/rocksdb/table/block_based/data_block_footer.cc  59
-rw-r--r--  src/rocksdb/table/block_based/data_block_footer.h  25
-rw-r--r--  src/rocksdb/table/block_based/data_block_hash_index.cc  93
-rw-r--r--  src/rocksdb/table/block_based/data_block_hash_index.h  136
-rw-r--r--  src/rocksdb/table/block_based/data_block_hash_index_test.cc  719
-rw-r--r--  src/rocksdb/table/block_based/filter_block.h  176
-rw-r--r--  src/rocksdb/table/block_based/filter_block_reader_common.cc  102
-rw-r--r--  src/rocksdb/table/block_based/filter_block_reader_common.h  55
-rw-r--r--  src/rocksdb/table/block_based/filter_policy.cc  759
-rw-r--r--  src/rocksdb/table/block_based/filter_policy_internal.h  142
-rw-r--r--  src/rocksdb/table/block_based/flush_block_policy.cc  88
-rw-r--r--  src/rocksdb/table/block_based/flush_block_policy.h  41
-rw-r--r--  src/rocksdb/table/block_based/full_filter_block.cc  338
-rw-r--r--  src/rocksdb/table/block_based/full_filter_block.h  139
-rw-r--r--  src/rocksdb/table/block_based/full_filter_block_test.cc  333
-rw-r--r--  src/rocksdb/table/block_based/index_builder.cc  222
-rw-r--r--  src/rocksdb/table/block_based/index_builder.h  443
-rw-r--r--  src/rocksdb/table/block_based/mock_block_based_table.h  56
-rw-r--r--  src/rocksdb/table/block_based/parsed_full_filter_block.cc  22
-rw-r--r--  src/rocksdb/table/block_based/parsed_full_filter_block.h  40
-rw-r--r--  src/rocksdb/table/block_based/partitioned_filter_block.cc  388
-rw-r--r--  src/rocksdb/table/block_based/partitioned_filter_block.h  122
-rw-r--r--  src/rocksdb/table/block_based/partitioned_filter_block_test.cc  424
-rw-r--r--  src/rocksdb/table/block_based/uncompression_dict_reader.cc  120
-rw-r--r--  src/rocksdb/table/block_based/uncompression_dict_reader.h  59
-rw-r--r--  src/rocksdb/table/block_fetcher.cc  284
-rw-r--r--  src/rocksdb/table/block_fetcher.h  109
-rw-r--r--  src/rocksdb/table/cleanable_test.cc  277
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_builder.cc  528
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_builder.h  136
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc  662
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_factory.cc  72
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_factory.h  92
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_reader.cc  399
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_reader.h  100
-rw-r--r--  src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc  578
-rw-r--r--  src/rocksdb/table/format.cc  465
-rw-r--r--  src/rocksdb/table/format.h  344
-rw-r--r--  src/rocksdb/table/get_context.cc  366
-rw-r--r--  src/rocksdb/table/get_context.h  191
-rw-r--r--  src/rocksdb/table/internal_iterator.h  182
-rw-r--r--  src/rocksdb/table/iter_heap.h  42
-rw-r--r--  src/rocksdb/table/iterator.cc  210
-rw-r--r--  src/rocksdb/table/iterator_wrapper.h  149
-rw-r--r--  src/rocksdb/table/merger_test.cc  180
-rw-r--r--  src/rocksdb/table/merging_iterator.cc  468
-rw-r--r--  src/rocksdb/table/merging_iterator.h  64
-rw-r--r--  src/rocksdb/table/meta_blocks.cc  525
-rw-r--r--  src/rocksdb/table/meta_blocks.h  152
-rw-r--r--  src/rocksdb/table/mock_table.cc  148
-rw-r--r--  src/rocksdb/table/mock_table.h  214
-rw-r--r--  src/rocksdb/table/multiget_context.h  259
-rw-r--r--  src/rocksdb/table/persistent_cache_helper.cc  113
-rw-r--r--  src/rocksdb/table/persistent_cache_helper.h  44
-rw-r--r--  src/rocksdb/table/persistent_cache_options.h  34
-rw-r--r--  src/rocksdb/table/plain/plain_table_bloom.cc  78
-rw-r--r--  src/rocksdb/table/plain/plain_table_bloom.h  135
-rw-r--r--  src/rocksdb/table/plain/plain_table_builder.cc  314
-rw-r--r--  src/rocksdb/table/plain/plain_table_builder.h  151
-rw-r--r--  src/rocksdb/table/plain/plain_table_factory.cc  235
-rw-r--r--  src/rocksdb/table/plain/plain_table_factory.h  223
-rw-r--r--  src/rocksdb/table/plain/plain_table_index.cc  211
-rw-r--r--  src/rocksdb/table/plain/plain_table_index.h  249
-rw-r--r--  src/rocksdb/table/plain/plain_table_key_coding.cc  498
-rw-r--r--  src/rocksdb/table/plain/plain_table_key_coding.h  193
-rw-r--r--  src/rocksdb/table/plain/plain_table_reader.cc  775
-rw-r--r--  src/rocksdb/table/plain/plain_table_reader.h  246
-rw-r--r--  src/rocksdb/table/scoped_arena_iterator.h  61
-rw-r--r--  src/rocksdb/table/sst_file_reader.cc  91
-rw-r--r--  src/rocksdb/table/sst_file_reader_test.cc  174
-rw-r--r--  src/rocksdb/table/sst_file_writer.cc  319
-rw-r--r--  src/rocksdb/table/sst_file_writer_collectors.h  94
-rw-r--r--  src/rocksdb/table/table_builder.h  170
-rw-r--r--  src/rocksdb/table/table_properties.cc  272
-rw-r--r--  src/rocksdb/table/table_properties_internal.h  30
-rw-r--r--  src/rocksdb/table/table_reader.h  137
-rw-r--r--  src/rocksdb/table/table_reader_bench.cc  347
-rw-r--r--  src/rocksdb/table/table_reader_caller.h  39
-rw-r--r--  src/rocksdb/table/table_test.cc  4651
-rw-r--r--  src/rocksdb/table/two_level_iterator.cc  211
-rw-r--r--  src/rocksdb/table/two_level_iterator.h  43
101 files changed, 34176 insertions, 0 deletions
diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.cc b/src/rocksdb/table/adaptive/adaptive_table_factory.cc
new file mode 100644
index 000000000..fa94e7bcd
--- /dev/null
+++ b/src/rocksdb/table/adaptive/adaptive_table_factory.cc
@@ -0,0 +1,124 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/adaptive/adaptive_table_factory.h"
+
+#include "table/table_builder.h"
+#include "table/format.h"
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AdaptiveTableFactory::AdaptiveTableFactory(
+ std::shared_ptr<TableFactory> table_factory_to_write,
+ std::shared_ptr<TableFactory> block_based_table_factory,
+ std::shared_ptr<TableFactory> plain_table_factory,
+ std::shared_ptr<TableFactory> cuckoo_table_factory)
+ : table_factory_to_write_(table_factory_to_write),
+ block_based_table_factory_(block_based_table_factory),
+ plain_table_factory_(plain_table_factory),
+ cuckoo_table_factory_(cuckoo_table_factory) {
+ if (!plain_table_factory_) {
+ plain_table_factory_.reset(NewPlainTableFactory());
+ }
+ if (!block_based_table_factory_) {
+ block_based_table_factory_.reset(NewBlockBasedTableFactory());
+ }
+ if (!cuckoo_table_factory_) {
+ cuckoo_table_factory_.reset(NewCuckooTableFactory());
+ }
+ if (!table_factory_to_write_) {
+ table_factory_to_write_ = block_based_table_factory_;
+ }
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kCuckooTableMagicNumber;
+
+Status AdaptiveTableFactory::NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ Footer footer;
+ auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */,
+ file_size, &footer);
+ if (!s.ok()) {
+ return s;
+ }
+ if (footer.table_magic_number() == kPlainTableMagicNumber ||
+ footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
+ return plain_table_factory_->NewTableReader(
+ table_reader_options, std::move(file), file_size, table);
+ } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
+ footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+ return block_based_table_factory_->NewTableReader(
+ table_reader_options, std::move(file), file_size, table);
+ } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
+ return cuckoo_table_factory_->NewTableReader(
+ table_reader_options, std::move(file), file_size, table);
+ } else {
+ return Status::NotSupported("Unidentified table format");
+ }
+}
+
+TableBuilder* AdaptiveTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
+ WritableFileWriter* file) const {
+ return table_factory_to_write_->NewTableBuilder(table_builder_options,
+ column_family_id, file);
+}
+
+std::string AdaptiveTableFactory::GetPrintableTableOptions() const {
+ std::string ret;
+ ret.reserve(20000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ if (table_factory_to_write_) {
+ snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n",
+ (table_factory_to_write_->Name() ? table_factory_to_write_->Name()
+ : ""),
+ table_factory_to_write_->GetPrintableTableOptions().c_str());
+ ret.append(buffer);
+ }
+ if (plain_table_factory_) {
+ snprintf(buffer, kBufferSize, " %s options:\n%s\n",
+ plain_table_factory_->Name() ? plain_table_factory_->Name() : "",
+ plain_table_factory_->GetPrintableTableOptions().c_str());
+ ret.append(buffer);
+ }
+ if (block_based_table_factory_) {
+ snprintf(
+ buffer, kBufferSize, " %s options:\n%s\n",
+ (block_based_table_factory_->Name() ? block_based_table_factory_->Name()
+ : ""),
+ block_based_table_factory_->GetPrintableTableOptions().c_str());
+ ret.append(buffer);
+ }
+ if (cuckoo_table_factory_) {
+ snprintf(buffer, kBufferSize, " %s options:\n%s\n",
+ cuckoo_table_factory_->Name() ? cuckoo_table_factory_->Name() : "",
+ cuckoo_table_factory_->GetPrintableTableOptions().c_str());
+ ret.append(buffer);
+ }
+ return ret;
+}
+
+extern TableFactory* NewAdaptiveTableFactory(
+ std::shared_ptr<TableFactory> table_factory_to_write,
+ std::shared_ptr<TableFactory> block_based_table_factory,
+ std::shared_ptr<TableFactory> plain_table_factory,
+ std::shared_ptr<TableFactory> cuckoo_table_factory) {
+ return new AdaptiveTableFactory(table_factory_to_write,
+ block_based_table_factory, plain_table_factory, cuckoo_table_factory);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
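For context, a minimal sketch of how this factory is typically wired up
through the public API (a hedged example, not part of the change; it assumes
the declaration of NewAdaptiveTableFactory in rocksdb/table.h defaults all
four factory arguments to nullptr, and "/tmp/adaptive_db" is a hypothetical
path):

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/table.h"

  rocksdb::Options options;
  // Reads any of the supported table formats; writes block-based tables,
  // because table_factory_to_write is left null and the constructor above
  // falls back to block_based_table_factory_.
  options.table_factory.reset(rocksdb::NewAdaptiveTableFactory());
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/adaptive_db", &db);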
diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.h b/src/rocksdb/table/adaptive/adaptive_table_factory.h
new file mode 100644
index 000000000..fcc4c682c
--- /dev/null
+++ b/src/rocksdb/table/adaptive/adaptive_table_factory.h
@@ -0,0 +1,63 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+class AdaptiveTableFactory : public TableFactory {
+ public:
+ ~AdaptiveTableFactory() {}
+
+ explicit AdaptiveTableFactory(
+ std::shared_ptr<TableFactory> table_factory_to_write,
+ std::shared_ptr<TableFactory> block_based_table_factory,
+ std::shared_ptr<TableFactory> plain_table_factory,
+ std::shared_ptr<TableFactory> cuckoo_table_factory);
+
+ const char* Name() const override { return "AdaptiveTableFactory"; }
+
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool prefetch_index_and_filter_in_cache = true) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ uint32_t column_family_id, WritableFileWriter* file) const override;
+
+ // Sanitizes the specified DB Options.
+ Status SanitizeOptions(
+ const DBOptions& /*db_opts*/,
+ const ColumnFamilyOptions& /*cf_opts*/) const override {
+ return Status::OK();
+ }
+
+ std::string GetPrintableTableOptions() const override;
+
+ private:
+ std::shared_ptr<TableFactory> table_factory_to_write_;
+ std::shared_ptr<TableFactory> block_based_table_factory_;
+ std::shared_ptr<TableFactory> plain_table_factory_;
+ std::shared_ptr<TableFactory> cuckoo_table_factory_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/block_based/block.cc b/src/rocksdb/table/block_based/block.cc
new file mode 100644
index 000000000..a04dd8ac2
--- /dev/null
+++ b/src/rocksdb/table/block_based/block.cc
@@ -0,0 +1,1004 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block_based/block.h"
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_footer.h"
+#include "table/format.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively. Will not derefence past "limit".
+//
+// If any errors are detected, returns nullptr. Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
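+//
+// For example, if the previous key was "abcxy" and the entry encodes
+// (shared = 3, non_shared = 2, value_length = 5) followed by the key delta
+// "cd", the full key is "abc" + "cd" = "abccd", and the 5 value bytes
+// follow the delta.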
+struct DecodeEntry {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared,
+ uint32_t* value_length) {
+ // We need 2 bytes for shared and non_shared size. We also need one more
+ // byte either for value size or the actual value in case of value delta
+ // encoding.
+ assert(limit - p >= 3);
+ *shared = reinterpret_cast<const unsigned char*>(p)[0];
+ *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+ *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+ if ((*shared | *non_shared | *value_length) < 128) {
+ // Fast path: all three values are encoded in one byte each
+ p += 3;
+ } else {
+ if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+ return nullptr;
+ }
+ }
+
+ // Using an assert in place of "return null" since we should not pay the
+ // cost of checking for corruption on every single key decoding
+ assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
+ return p;
+ }
+};
+
+// Helper routine: similar to DecodeEntry but does not have assertions.
+// Instead, returns nullptr so that caller can detect and report failure.
+struct CheckAndDecodeEntry {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared,
+ uint32_t* value_length) {
+ // We need 2 bytes for shared and non_shared size. We also need one more
+ // byte either for value size or the actual value in case of value delta
+ // encoding.
+ if (limit - p < 3) {
+ return nullptr;
+ }
+ *shared = reinterpret_cast<const unsigned char*>(p)[0];
+ *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+ *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+ if ((*shared | *non_shared | *value_length) < 128) {
+ // Fast path: all three values are encoded in one byte each
+ p += 3;
+ } else {
+ if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+ return nullptr;
+ }
+ }
+
+ if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+ return nullptr;
+ }
+ return p;
+ }
+};
+
+struct DecodeKey {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared) {
+ uint32_t value_length;
+ return DecodeEntry()(p, limit, shared, non_shared, &value_length);
+ }
+};
+
+// In format_version 4, which is used by index blocks, the value size is not
+// encoded before the entry, as the value is known to be the handle with the
+// known size.
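+//
+// For example, a format_version 4 index entry encodes only
+// (shared_size, non_shared_size, key delta); the handle value that follows
+// is decoded separately (see IndexBlockIter::DecodeCurrentValue below).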
+struct DecodeKeyV4 {
+ inline const char* operator()(const char* p, const char* limit,
+ uint32_t* shared, uint32_t* non_shared) {
+ // We need 2 bytes for shared and non_shared size. We also need one more
+ // byte either for value size or the actual value in case of value delta
+ // encoding.
+ if (limit - p < 3) return nullptr;
+ *shared = reinterpret_cast<const unsigned char*>(p)[0];
+ *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+ if ((*shared | *non_shared) < 128) {
+ // Fast path: both values are encoded in one byte each
+ p += 2;
+ } else {
+ if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+ if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+ }
+ return p;
+ }
+};
+
+void DataBlockIter::Next() {
+ assert(Valid());
+ ParseNextDataKey<DecodeEntry>();
+}
+
+void DataBlockIter::NextOrReport() {
+ assert(Valid());
+ ParseNextDataKey<CheckAndDecodeEntry>();
+}
+
+void IndexBlockIter::Next() {
+ assert(Valid());
+ ParseNextIndexKey();
+}
+
+void IndexBlockIter::Prev() {
+ assert(Valid());
+ // Scan backwards to a restart point before current_
+ const uint32_t original = current_;
+ while (GetRestartPoint(restart_index_) >= original) {
+ if (restart_index_ == 0) {
+ // No more entries
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return;
+ }
+ restart_index_--;
+ }
+ SeekToRestartPoint(restart_index_);
+ // Loop until end of current entry hits the start of original entry
+ while (ParseNextIndexKey() && NextEntryOffset() < original) {
+ }
+}
+
+// Similar to IndexBlockIter::Prev but also caches the prev entries
+void DataBlockIter::Prev() {
+ assert(Valid());
+
+ assert(prev_entries_idx_ == -1 ||
+ static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
+ // Check if we can use cached prev_entries_
+ if (prev_entries_idx_ > 0 &&
+ prev_entries_[prev_entries_idx_].offset == current_) {
+ // Read cached CachedPrevEntry
+ prev_entries_idx_--;
+ const CachedPrevEntry& current_prev_entry =
+ prev_entries_[prev_entries_idx_];
+
+ const char* key_ptr = nullptr;
+ if (current_prev_entry.key_ptr != nullptr) {
+ // The key is not delta encoded and stored in the data block
+ key_ptr = current_prev_entry.key_ptr;
+ key_pinned_ = true;
+ } else {
+ // The key is delta encoded and stored in prev_entries_keys_buff_
+ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
+ key_pinned_ = false;
+ }
+ const Slice current_key(key_ptr, current_prev_entry.key_size);
+
+ current_ = current_prev_entry.offset;
+ key_.SetKey(current_key, false /* copy */);
+ value_ = current_prev_entry.value;
+
+ return;
+ }
+
+ // Clear prev entries cache
+ prev_entries_idx_ = -1;
+ prev_entries_.clear();
+ prev_entries_keys_buff_.clear();
+
+ // Scan backwards to a restart point before current_
+ const uint32_t original = current_;
+ while (GetRestartPoint(restart_index_) >= original) {
+ if (restart_index_ == 0) {
+ // No more entries
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return;
+ }
+ restart_index_--;
+ }
+
+ SeekToRestartPoint(restart_index_);
+
+ do {
+ if (!ParseNextDataKey<DecodeEntry>()) {
+ break;
+ }
+ Slice current_key = key();
+
+ if (key_.IsKeyPinned()) {
+ // The key is not delta encoded
+ prev_entries_.emplace_back(current_, current_key.data(), 0,
+ current_key.size(), value());
+ } else {
+ // The key is delta encoded, cache decoded key in buffer
+ size_t new_key_offset = prev_entries_keys_buff_.size();
+ prev_entries_keys_buff_.append(current_key.data(), current_key.size());
+
+ prev_entries_.emplace_back(current_, nullptr, new_key_offset,
+ current_key.size(), value());
+ }
+ // Loop until end of current entry hits the start of original entry
+ } while (NextEntryOffset() < original);
+ prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
+}
+
+void DataBlockIter::Seek(const Slice& target) {
+ Slice seek_key = target;
+ PERF_TIMER_GUARD(block_seek_nanos);
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ uint32_t index = 0;
+ bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
+
+ if (!ok) {
+ return;
+ }
+ SeekToRestartPoint(index);
+
+ // Linear search (within restart block) for first key >= target
+ while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
+ }
+}
+
+// Optimized Seek for point lookup for an internal key `target`
+// target = "seek_user_key @ type | seqno".
+//
+// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex, this function behaves identically as Seek().
+//
+// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex:
+//
+// If the return value is FALSE, iter location is undefined, and it means:
+// 1) there is no key in this block falling into the range:
+// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
+// inclusive; AND
+// 2) the last key of this block has a greater user_key from seek_user_key
+//
+// If the return value is TRUE, iter location has two possibilities:
+// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
+// this case, it points to the first key_ with a larger user_key or a
+// matching user_key with a seqno no greater than the seeking seqno.
+// 2) If the iter is invalid, it means that either every user_key is less
+// than the seek_user_key, or the block ends with a matching user_key but
+// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
+// but larger type).
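+//
+// For example, if the last restart interval of a block holds
+// [ax@100, ax@90, bx@80] (all kTypeValue), then SeekForGetImpl on target
+// "ax@95" positions the iterator on ax@90, the first entry with a matching
+// user_key and seqno no greater than 95, and returns true.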
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+ Slice target_user_key = ExtractUserKey(target);
+ uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+ uint8_t entry =
+ data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
+
+ if (entry == kCollision) {
+ // HashSeek not effective, falling back
+ Seek(target);
+ return true;
+ }
+
+ if (entry == kNoEntry) {
+ // Even if we cannot find the user_key in this block, the result may
+ // exist in the next block. Consider this example:
+ //
+ // Block N: [aab@100, ... , app@120]
+ // boundary key: axy@50 (we make minimal assumption about a boundary key)
+ // Block N+1: [axy@10, ... ]
+ //
+ // If seek_key = axy@60, the search will start from Block N.
+ // Even if the user_key is not found in the hash map, the caller still
+ // has to continue searching the next block.
+ //
+ // In this case, we pretend the key is in the last restart interval.
+ // The while-loop below will search the last restart interval for the
+ // key. It will stop at the first key that is larger than the seek_key,
+ // or to the end of the block if no one is larger.
+ entry = static_cast<uint8_t>(num_restarts_ - 1);
+ }
+
+ uint32_t restart_index = entry;
+
+ // check if the key is in the restart_interval
+ assert(restart_index < num_restarts_);
+ SeekToRestartPoint(restart_index);
+
+ const char* limit = nullptr;
+ if (restart_index_ + 1 < num_restarts_) {
+ limit = data_ + GetRestartPoint(restart_index_ + 1);
+ } else {
+ limit = data_ + restarts_;
+ }
+
+ while (true) {
+ // Here we only linearly seek the target key inside the restart interval.
+ // If a key does not exist inside a restart interval, we avoid
+ // further searching the block content across restart interval boundaries.
+ //
+ // TODO(fwu): check the left and right boundary of the restart interval
+ // to avoid linearly seeking a target key that is out of range.
+ if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
+ // we stop at the first potential matching user key.
+ break;
+ }
+ }
+
+ if (current_ == restarts_) {
+ // Search reaches the end of the block. There are three possibilities:
+ // 1) there is only one user_key match in the block (otherwise collision).
+ // the matching user_key resides in the last restart interval, and it
+ // is the last key of the restart interval and of the block as well.
+ // ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
+ //
+ // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+ // AND all existing user_keys in the restart interval are smaller than
+ // seek_user_key.
+ //
+ // 3) The seek_key is a false positive and happens to be hashed to the
+ // last restart interval, AND all existing user_keys in the restart
+ // interval are smaller than seek_user_key.
+ //
+ // The result may exist in the next block in each case, so we return true.
+ return true;
+ }
+
+ if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
+ // the key is not in this block and cannot be at the next block either.
+ return false;
+ }
+
+ // Here we are conservative and only support a limited set of cases
+ ValueType value_type = ExtractValueType(key_.GetKey());
+ if (value_type != ValueType::kTypeValue &&
+ value_type != ValueType::kTypeDeletion &&
+ value_type != ValueType::kTypeSingleDeletion &&
+ value_type != ValueType::kTypeBlobIndex) {
+ Seek(target);
+ return true;
+ }
+
+ // Result found, and the iter is correctly set.
+ return true;
+}
+
+void IndexBlockIter::Seek(const Slice& target) {
+ TEST_SYNC_POINT("IndexBlockIter::Seek:0");
+ Slice seek_key = target;
+ if (!key_includes_seq_) {
+ seek_key = ExtractUserKey(target);
+ }
+ PERF_TIMER_GUARD(block_seek_nanos);
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ status_ = Status::OK();
+ uint32_t index = 0;
+ bool ok = false;
+ if (prefix_index_) {
+ bool prefix_may_exist = true;
+ ok = PrefixSeek(target, &index, &prefix_may_exist);
+ if (!prefix_may_exist) {
+ // This is to let the caller to distinguish between non-existing prefix,
+ // and when key is larger than the last key, which both set Valid() to
+ // false.
+ current_ = restarts_;
+ status_ = Status::NotFound();
+ }
+ } else if (value_delta_encoded_) {
+ ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
+ } else {
+ ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
+ }
+
+ if (!ok) {
+ return;
+ }
+ SeekToRestartPoint(index);
+
+ // Linear search (within restart block) for first key >= target
+ while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) {
+ }
+}
+
+void DataBlockIter::SeekForPrev(const Slice& target) {
+ PERF_TIMER_GUARD(block_seek_nanos);
+ Slice seek_key = target;
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ uint32_t index = 0;
+ bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+ comparator_);
+
+ if (!ok) {
+ return;
+ }
+ SeekToRestartPoint(index);
+
+ // Linear search (within restart block) for first key >= seek_key
+ while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
+ }
+ if (!Valid()) {
+ SeekToLast();
+ } else {
+ while (Valid() && Compare(key_, seek_key) > 0) {
+ Prev();
+ }
+ }
+}
+
+void DataBlockIter::SeekToFirst() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ SeekToRestartPoint(0);
+ ParseNextDataKey<DecodeEntry>();
+}
+
+void DataBlockIter::SeekToFirstOrReport() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ SeekToRestartPoint(0);
+ ParseNextDataKey<CheckAndDecodeEntry>();
+}
+
+void IndexBlockIter::SeekToFirst() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ status_ = Status::OK();
+ SeekToRestartPoint(0);
+ ParseNextIndexKey();
+}
+
+void DataBlockIter::SeekToLast() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ SeekToRestartPoint(num_restarts_ - 1);
+ while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
+ // Keep skipping
+ }
+}
+
+void IndexBlockIter::SeekToLast() {
+ if (data_ == nullptr) { // Not init yet
+ return;
+ }
+ status_ = Status::OK();
+ SeekToRestartPoint(num_restarts_ - 1);
+ while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
+ // Keep skipping
+ }
+}
+
+template <class TValue>
+void BlockIter<TValue>::CorruptionError() {
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::Corruption("bad entry in block");
+ key_.Clear();
+ value_.clear();
+}
+
+template <typename DecodeEntryFunc>
+bool DataBlockIter::ParseNextDataKey(const char* limit) {
+ current_ = NextEntryOffset();
+ const char* p = data_ + current_;
+ if (!limit) {
+ limit = data_ + restarts_; // Restarts come right after data
+ }
+
+ if (p >= limit) {
+ // No more entries to return. Mark as invalid.
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return false;
+ }
+
+ // Decode next entry
+ uint32_t shared, non_shared, value_length;
+ p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
+ if (p == nullptr || key_.Size() < shared) {
+ CorruptionError();
+ return false;
+ } else {
+ if (shared == 0) {
+ // If this key doesn't share any bytes with the prev key then we don't
+ // need to decode it and can use its address in the block directly.
+ key_.SetKey(Slice(p, non_shared), false /* copy */);
+ key_pinned_ = true;
+ } else {
+ // This key shares `shared` bytes with the prev key; we need to decode it
+ key_.TrimAppend(shared, p, non_shared);
+ key_pinned_ = false;
+ }
+
+ if (global_seqno_ != kDisableGlobalSequenceNumber) {
+ // If we are reading a file with a global sequence number we should
+ // expect that all encoded sequence numbers are zeros and any value
+ // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
+ assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);
+
+ ValueType value_type = ExtractValueType(key_.GetKey());
+ assert(value_type == ValueType::kTypeValue ||
+ value_type == ValueType::kTypeMerge ||
+ value_type == ValueType::kTypeDeletion ||
+ value_type == ValueType::kTypeRangeDeletion);
+
+ if (key_pinned_) {
+ // TODO(tec): Investigate updating the seqno in the loaded block
+ // directly instead of doing a copy and update.
+
+ // We cannot use the key address in the block directly because
+ // we have a global_seqno_ that will overwrite the encoded one.
+ key_.OwnKey();
+ key_pinned_ = false;
+ }
+
+ key_.UpdateInternalKey(global_seqno_, value_type);
+ }
+
+ value_ = Slice(p + non_shared, value_length);
+ if (shared == 0) {
+ while (restart_index_ + 1 < num_restarts_ &&
+ GetRestartPoint(restart_index_ + 1) < current_) {
+ ++restart_index_;
+ }
+ }
+ // else we are in the middle of a restart interval and the restart_index_
+ // thus has not changed
+ return true;
+ }
+}
+
+bool IndexBlockIter::ParseNextIndexKey() {
+ current_ = NextEntryOffset();
+ const char* p = data_ + current_;
+ const char* limit = data_ + restarts_; // Restarts come right after data
+ if (p >= limit) {
+ // No more entries to return. Mark as invalid.
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return false;
+ }
+
+ // Decode next entry
+ uint32_t shared, non_shared, value_length;
+ if (value_delta_encoded_) {
+ p = DecodeKeyV4()(p, limit, &shared, &non_shared);
+ value_length = 0;
+ } else {
+ p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
+ }
+ if (p == nullptr || key_.Size() < shared) {
+ CorruptionError();
+ return false;
+ }
+ if (shared == 0) {
+ // If this key doesn't share any bytes with the prev key then we don't
+ // need to decode it and can use its address in the block directly.
+ key_.SetKey(Slice(p, non_shared), false /* copy */);
+ key_pinned_ = true;
+ } else {
+ // This key shares `shared` bytes with the prev key; we need to decode it
+ key_.TrimAppend(shared, p, non_shared);
+ key_pinned_ = false;
+ }
+ value_ = Slice(p + non_shared, value_length);
+ if (shared == 0) {
+ while (restart_index_ + 1 < num_restarts_ &&
+ GetRestartPoint(restart_index_ + 1) < current_) {
+ ++restart_index_;
+ }
+ }
+ // else we are in the middle of a restart interval and the restart_index_
+ // thus has not changed
+ if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+ DecodeCurrentValue(shared);
+ }
+ return true;
+}
+
+// The format:
+// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// ...
+// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// where, k is key, v is value, and its encoding is in parenthesis.
+// The format of each key is (shared_size, non_shared_size, non_shared bytes).
+// The format of each value, i.e., block handle, is (offset, size) whenever the
+// shared_size is 0, which includes the first entry in each restart point.
+// Otherwise the format is delta-size = block handle size - size of last block
+// handle.
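+//
+// For example, within one restart interval:
+// entry 0 (shared_size == 0): value is a full handle, e.g. (offset, size)
+// entry 1 (shared_size != 0): value is delta-size only; the reader derives
+// the handle from the previous one plus the delta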
+void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
+ Slice v(value_.data(), data_ + restarts_ - value_.data());
+ // Delta encoding is used if `shared` != 0.
+ Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
+ &v, have_first_key_,
+ (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
+ assert(decode_s.ok());
+ value_ = Slice(value_.data(), v.data() - value_.data());
+
+ if (global_seqno_state_ != nullptr) {
+ // Overwrite sequence number the same way as in DataBlockIter.
+
+ IterKey& first_internal_key = global_seqno_state_->first_internal_key;
+ first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
+ /* copy */ true);
+
+ assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
+
+ ValueType value_type = ExtractValueType(first_internal_key.GetKey());
+ assert(value_type == ValueType::kTypeValue ||
+ value_type == ValueType::kTypeMerge ||
+ value_type == ValueType::kTypeDeletion ||
+ value_type == ValueType::kTypeRangeDeletion);
+
+ first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
+ value_type);
+ decoded_value_.first_internal_key = first_internal_key.GetKey();
+ }
+}
+
+// Binary search in the restart array to find the restart point that is
+// either the last one with a key less than target (which means the key of
+// the next restart point is larger than target), or the first one with a
+// key equal to target.
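+//
+// For example, with restart keys ["b", "d", "f"] and target "e", the search
+// settles on the restart point of "d"; the caller's linear scan then starts
+// there and stops at "f", the first key >= "e".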
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
+ uint32_t right, uint32_t* index,
+ const Comparator* comp) {
+ assert(left <= right);
+
+ while (left < right) {
+ uint32_t mid = (left + right + 1) / 2;
+ uint32_t region_offset = GetRestartPoint(mid);
+ uint32_t shared, non_shared;
+ const char* key_ptr = DecodeKeyFunc()(
+ data_ + region_offset, data_ + restarts_, &shared, &non_shared);
+ if (key_ptr == nullptr || (shared != 0)) {
+ CorruptionError();
+ return false;
+ }
+ Slice mid_key(key_ptr, non_shared);
+ int cmp = comp->Compare(mid_key, target);
+ if (cmp < 0) {
+ // Key at "mid" is smaller than "target". Therefore all
+ // blocks before "mid" are uninteresting.
+ left = mid;
+ } else if (cmp > 0) {
+ // Key at "mid" is >= "target". Therefore all blocks at or
+ // after "mid" are uninteresting.
+ right = mid - 1;
+ } else {
+ left = right = mid;
+ }
+ }
+
+ *index = left;
+ return true;
+}
+
+// Compare target key and the block key of the block of `block_index`.
+// On corruption, sets status_ and returns 1 (treating target as smaller).
+int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+ uint32_t region_offset = GetRestartPoint(block_index);
+ uint32_t shared, non_shared;
+ const char* key_ptr =
+ value_delta_encoded_
+ ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
+ &non_shared)
+ : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
+ &non_shared);
+ if (key_ptr == nullptr || (shared != 0)) {
+ CorruptionError();
+ return 1; // Return target is smaller
+ }
+ Slice block_key(key_ptr, non_shared);
+ return Compare(block_key, target);
+}
+
+// Binary search in block_ids to find the first block
+// with a key >= target
+bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
+ uint32_t* block_ids, uint32_t left,
+ uint32_t right, uint32_t* index,
+ bool* prefix_may_exist) {
+ assert(left <= right);
+ assert(index);
+ assert(prefix_may_exist);
+ *prefix_may_exist = true;
+ uint32_t left_bound = left;
+
+ while (left <= right) {
+ uint32_t mid = (right + left) / 2;
+
+ int cmp = CompareBlockKey(block_ids[mid], target);
+ if (!status_.ok()) {
+ return false;
+ }
+ if (cmp < 0) {
+ // Key at "target" is larger than "mid". Therefore all
+ // blocks before or at "mid" are uninteresting.
+ left = mid + 1;
+ } else {
+ // Key at "target" is <= "mid". Therefore all blocks
+ // after "mid" are uninteresting.
+ // If there is only one block left, we found it.
+ if (left == right) break;
+ right = mid;
+ }
+ }
+
+ if (left == right) {
+ // In one of the two following cases:
+ // (1) left is the first one of block_ids, or
+ // (2) there is a gap of blocks between the block of `left` and `left-1`,
+ // we can further distinguish whether the key is in the block or does not
+ // exist, by comparing the target key with the key of the previous
+ // block, to the left of the block found.
+ if (block_ids[left] > 0 &&
+ (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
+ CompareBlockKey(block_ids[left] - 1, target) > 0) {
+ current_ = restarts_;
+ *prefix_may_exist = false;
+ return false;
+ }
+
+ *index = block_ids[left];
+ return true;
+ } else {
+ assert(left > right);
+
+ // If the next block key is larger than the seek key, it is possible that
+ // no key shares the prefix with `target`, or all keys with the same
+ // prefix as `target` are smaller than `target`. In the latter case,
+ // we are mandated to set the position the same as the total order.
+ // Then either:
+ // (1) `target` falls into the range of the next block. In this case,
+ // we can place the iterator to the next block, or
+ // (2) `target` is larger than all block keys. In this case we can
+ // keep the iterator invalidated without setting `prefix_may_exist`
+ // to false.
+ // We might sometimes end up setting the total order position
+ // while there is no key sharing the prefix with `target`, but it
+ // still follows the contract.
+ uint32_t right_index = block_ids[right];
+ assert(right_index + 1 <= num_restarts_);
+ if (right_index + 1 < num_restarts_) {
+ if (CompareBlockKey(right_index + 1, target) >= 0) {
+ *index = right_index + 1;
+ return true;
+ } else {
+ // We have to set the flag here because we are not positioning
+ // the iterator to the total order position.
+ *prefix_may_exist = false;
+ }
+ }
+
+ // Mark iterator invalid
+ current_ = restarts_;
+ return false;
+ }
+}
+
+bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
+ bool* prefix_may_exist) {
+ assert(index);
+ assert(prefix_may_exist);
+ assert(prefix_index_);
+ *prefix_may_exist = true;
+ Slice seek_key = target;
+ if (!key_includes_seq_) {
+ seek_key = ExtractUserKey(target);
+ }
+ uint32_t* block_ids = nullptr;
+ uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
+
+ if (num_blocks == 0) {
+ current_ = restarts_;
+ *prefix_may_exist = false;
+ return false;
+ } else {
+ assert(block_ids);
+ return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
+ prefix_may_exist);
+ }
+}
+
+uint32_t Block::NumRestarts() const {
+ assert(size_ >= 2 * sizeof(uint32_t));
+ uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+ uint32_t num_restarts = block_footer;
+ if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+ // In BlockBuilder, we have ensured a block with HashIndex is less than
+ // kMaxBlockSizeSupportedByHashIndex (64KiB).
+ //
+ // Therefore, if we encounter a block with a size > 64KiB, the block
+ // cannot have HashIndex. So the footer will be directly interpreted as
+ // num_restarts.
+ //
+ // Such a check is for backward compatibility. It ensures that a legacy
+ // block with a very large num_restarts, i.e. >= 0x80000000, is interpreted
+ // correctly as having no HashIndex even if the MSB of num_restarts is set.
+ return num_restarts;
+ }
+ BlockBasedTableOptions::DataBlockIndexType index_type;
+ UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+ return num_restarts;
+}
+
+BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
+ assert(size_ >= 2 * sizeof(uint32_t));
+ if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+ // The check is for the same reason as that in NumRestarts()
+ return BlockBasedTableOptions::kDataBlockBinarySearch;
+ }
+ uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+ uint32_t num_restarts = block_footer;
+ BlockBasedTableOptions::DataBlockIndexType index_type;
+ UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+ return index_type;
+}
+
+Block::~Block() {
+ // This sync point can be re-enabled if RocksDB can control the
+ // initialization order of any/all static options created by the user.
+ // TEST_SYNC_POINT("Block::~Block");
+}
+
+Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
+ size_t read_amp_bytes_per_bit, Statistics* statistics)
+ : contents_(std::move(contents)),
+ data_(contents_.data.data()),
+ size_(contents_.data.size()),
+ restart_offset_(0),
+ num_restarts_(0),
+ global_seqno_(_global_seqno) {
+ TEST_SYNC_POINT("Block::Block:0");
+ if (size_ < sizeof(uint32_t)) {
+ size_ = 0; // Error marker
+ } else {
+ // Should only decode restart points for uncompressed blocks
+ num_restarts_ = NumRestarts();
+ switch (IndexType()) {
+ case BlockBasedTableOptions::kDataBlockBinarySearch:
+ restart_offset_ = static_cast<uint32_t>(size_) -
+ (1 + num_restarts_) * sizeof(uint32_t);
+ if (restart_offset_ > size_ - sizeof(uint32_t)) {
+ // The size is too small for NumRestarts() and therefore
+ // restart_offset_ wrapped around.
+ size_ = 0;
+ }
+ break;
+ case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+ if (size_ < sizeof(uint32_t) /* block footer */ +
+ sizeof(uint16_t) /* NUM_BUCK */) {
+ size_ = 0;
+ break;
+ }
+
+ uint16_t map_offset;
+ data_block_hash_index_.Initialize(
+ contents.data.data(),
+ static_cast<uint16_t>(contents.data.size() -
+ sizeof(uint32_t)), /*chop off
+ NUM_RESTARTS*/
+ &map_offset);
+
+ restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
+
+ if (restart_offset_ > map_offset) {
+ // map_offset is too small for NumRestarts() and
+ // therefore restart_offset_ wrapped around.
+ size_ = 0;
+ break;
+ }
+ break;
+ default:
+ size_ = 0; // Error marker
+ }
+ }
+ if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
+ read_amp_bitmap_.reset(new BlockReadAmpBitmap(
+ restart_offset_, read_amp_bytes_per_bit, statistics));
+ }
+}
+
+DataBlockIter* Block::NewDataIterator(const Comparator* cmp,
+ const Comparator* ucmp,
+ DataBlockIter* iter, Statistics* stats,
+ bool block_contents_pinned) {
+ DataBlockIter* ret_iter;
+ if (iter != nullptr) {
+ ret_iter = iter;
+ } else {
+ ret_iter = new DataBlockIter;
+ }
+ if (size_ < 2 * sizeof(uint32_t)) {
+ ret_iter->Invalidate(Status::Corruption("bad block contents"));
+ return ret_iter;
+ }
+ if (num_restarts_ == 0) {
+ // Empty block.
+ ret_iter->Invalidate(Status::OK());
+ return ret_iter;
+ } else {
+ ret_iter->Initialize(
+ cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
+ read_amp_bitmap_.get(), block_contents_pinned,
+ data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
+ if (read_amp_bitmap_) {
+ if (read_amp_bitmap_->GetStatistics() != stats) {
+ // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
+ read_amp_bitmap_->SetStatistics(stats);
+ }
+ }
+ }
+
+ return ret_iter;
+}
+
+IndexBlockIter* Block::NewIndexIterator(
+ const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter,
+ Statistics* /*stats*/, bool total_order_seek, bool have_first_key,
+ bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
+ BlockPrefixIndex* prefix_index) {
+ IndexBlockIter* ret_iter;
+ if (iter != nullptr) {
+ ret_iter = iter;
+ } else {
+ ret_iter = new IndexBlockIter;
+ }
+ if (size_ < 2 * sizeof(uint32_t)) {
+ ret_iter->Invalidate(Status::Corruption("bad block contents"));
+ return ret_iter;
+ }
+ if (num_restarts_ == 0) {
+ // Empty block.
+ ret_iter->Invalidate(Status::OK());
+ return ret_iter;
+ } else {
+ BlockPrefixIndex* prefix_index_ptr =
+ total_order_seek ? nullptr : prefix_index;
+ ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
+ global_seqno_, prefix_index_ptr, have_first_key,
+ key_includes_seq, value_is_full,
+ block_contents_pinned);
+ }
+
+ return ret_iter;
+}
+
+size_t Block::ApproximateMemoryUsage() const {
+ size_t usage = usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size((void*)this);
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ if (read_amp_bitmap_) {
+ usage += read_amp_bitmap_->ApproximateMemoryUsage();
+ }
+ return usage;
+}
+
+} // namespace ROCKSDB_NAMESPACE
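For context, a minimal sketch of reading a decoded block with the iterator
API defined above (a hedged example using this internal API; it assumes a
BlockContents `contents` already fetched and uncompressed, e.g. via
BlockFetcher, and bytewise-comparable keys):

  rocksdb::Block block(std::move(contents),
                       rocksdb::kDisableGlobalSequenceNumber);
  std::unique_ptr<rocksdb::DataBlockIter> iter(block.NewDataIterator(
      rocksdb::BytewiseComparator(), rocksdb::BytewiseComparator()));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() / iter->value() stay valid while `block` is alive.
  }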
diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h
new file mode 100644
index 000000000..e82a1b2a6
--- /dev/null
+++ b/src/rocksdb/table/block_based/block.h
@@ -0,0 +1,631 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "port/malloc.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+class Comparator;
+template <class TValue>
+class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
+class BlockPrefixIndex;
+
+// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
+// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
+// bytes in the Block we update the bitmap and increment
+// READ_AMP_ESTIMATE_USEFUL_BYTES.
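+//
+// For example, with bytes_per_bit = 32, a 4096-byte block needs
+// ceil(4096 / 32) = 128 bits, which the constructor below packs into
+// ceil(128 / 32) = 4 uint32_t words.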
+class BlockReadAmpBitmap {
+ public:
+ explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
+ Statistics* statistics)
+ : bitmap_(nullptr),
+ bytes_per_bit_pow_(0),
+ statistics_(statistics),
+ rnd_(Random::GetTLSInstance()->Uniform(
+ static_cast<int>(bytes_per_bit))) {
+ TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
+ assert(block_size > 0 && bytes_per_bit > 0);
+
+ // convert bytes_per_bit to be a power of 2
+ while (bytes_per_bit >>= 1) {
+ bytes_per_bit_pow_++;
+ }
+
+ // num_bits_needed = ceil(block_size / bytes_per_bit)
+ size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
+ assert(num_bits_needed > 0);
+
+ // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
+ size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
+
+ // Create bitmap and set all the bits to 0
+ bitmap_ = new std::atomic<uint32_t>[bitmap_size]();
+
+ RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
+ }
+
+ ~BlockReadAmpBitmap() { delete[] bitmap_; }
+
+ void Mark(uint32_t start_offset, uint32_t end_offset) {
+ assert(end_offset >= start_offset);
+ // Index of first bit in mask
+ uint32_t start_bit =
+ (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
+ bytes_per_bit_pow_;
+ // Index of last bit in mask + 1
+ uint32_t exclusive_end_bit =
+ (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
+ if (start_bit >= exclusive_end_bit) {
+ return;
+ }
+ assert(exclusive_end_bit > 0);
+
+ if (GetAndSet(start_bit) == 0) {
+ uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
+ << bytes_per_bit_pow_;
+ RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
+ new_useful_bytes);
+ }
+ }
+
+ Statistics* GetStatistics() {
+ return statistics_.load(std::memory_order_relaxed);
+ }
+
+ void SetStatistics(Statistics* stats) { statistics_.store(stats); }
+
+ uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
+
+ size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ return malloc_usable_size((void*)this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return sizeof(*this);
+ }
+
+ private:
+ // Get the current value of bit at `bit_idx` and set it to 1
+ inline bool GetAndSet(uint32_t bit_idx) {
+ const uint32_t byte_idx = bit_idx / kBitsPerEntry;
+ const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);
+
+ return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
+ bit_mask;
+ }
+
+ const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes
+ const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits
+
+ // Bitmap used to record the bytes that we read; use atomics to protect
+ // against multiple threads updating the same bit
+ std::atomic<uint32_t>* bitmap_;
+ // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use a power of 2 to optimize
+ // multiplication and division
+ uint8_t bytes_per_bit_pow_;
+ // Pointer to the DB Statistics object. Since this bitmap may outlive the
+ // DB, this pointer may become invalid, but the DB will update it to a valid
+ // pointer via SetStatistics() before calling Mark()
+ std::atomic<Statistics*> statistics_;
+ uint32_t rnd_;
+};
+
+// This Block class is not for any old block: it is designed to hold only
+// uncompressed blocks containing sorted key-value pairs. It is thus
+// suitable for storing uncompressed data blocks, index blocks (including
+// partitions), range deletion blocks, properties blocks, metaindex blocks,
+// as well as the top level of the partitioned filter structure (which is
+// actually an index of the filter partitions). It is NOT suitable for
+// compressed blocks in general, filter blocks/partitions, or compression
+// dictionaries (since the latter do not contain sorted key-value pairs).
+// Use BlockContents directly for those.
+//
+// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
+// for details of the format and the various block types.
+class Block {
+ public:
+ // Initialize the block with the specified contents.
+ explicit Block(BlockContents&& contents, SequenceNumber _global_seqno,
+ size_t read_amp_bytes_per_bit = 0,
+ Statistics* statistics = nullptr);
+ // No copying allowed
+ Block(const Block&) = delete;
+ void operator=(const Block&) = delete;
+
+ ~Block();
+
+ size_t size() const { return size_; }
+ const char* data() const { return data_; }
+ // The additional memory space taken by the block data.
+ size_t usable_size() const { return contents_.usable_size(); }
+ uint32_t NumRestarts() const;
+ bool own_bytes() const { return contents_.own_bytes(); }
+
+ BlockBasedTableOptions::DataBlockIndexType IndexType() const;
+
+ // If comparator is InternalKeyComparator, user_comparator is its user
+ // comparator; they are equal otherwise.
+ //
+ // If iter is null, return new Iterator
+ // If iter is not null, update this one and return it as Iterator*
+ //
+ // Updates read_amp_bitmap_ if it is not nullptr.
+ //
+ // If `block_contents_pinned` is true, the caller will guarantee that when
+ // the cleanup functions are transferred from the iterator to other
+ // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+ // valid. Either the iterator holds cache handle or ownership of some resource
+ // and release them in a release function, or caller is sure that the data
+ // will not go away (for example, it's from mmapped file which will not be
+ // closed).
+ //
+ // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+ // the iterator will simply be set as "invalid", rather than returning
+ // the key that is just past the target key.
+ DataBlockIter* NewDataIterator(const Comparator* comparator,
+ const Comparator* user_comparator,
+ DataBlockIter* iter = nullptr,
+ Statistics* stats = nullptr,
+ bool block_contents_pinned = false);
+
+ // key_includes_seq, default true, means that the keys are in internal key
+ // format.
+ // value_is_full, default true, means that no delta encoding is
+ // applied to values.
+ //
+ // If `prefix_index` is not nullptr this block will do hash lookup for the key
+ // prefix. If total_order_seek is true, prefix_index_ is ignored.
+ //
+ // `have_first_key` controls whether IndexValue will contain
+ // first_internal_key. It affects data serialization format, so the same value
+ // have_first_key must be used when writing and reading index.
+ // It is determined by IndexType property of the table.
+ IndexBlockIter* NewIndexIterator(const Comparator* comparator,
+ const Comparator* user_comparator,
+ IndexBlockIter* iter, Statistics* stats,
+ bool total_order_seek, bool have_first_key,
+ bool key_includes_seq, bool value_is_full,
+ bool block_contents_pinned = false,
+ BlockPrefixIndex* prefix_index = nullptr);
+
+ // Report an approximation of how much memory has been used.
+ size_t ApproximateMemoryUsage() const;
+
+ SequenceNumber global_seqno() const { return global_seqno_; }
+
+ private:
+ BlockContents contents_;
+ const char* data_; // contents_.data.data()
+ size_t size_; // contents_.data.size()
+ uint32_t restart_offset_; // Offset in data_ of restart array
+ uint32_t num_restarts_;
+ std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
+ // All keys in the block will have seqno = global_seqno_, regardless of
+ // the encoded value (kDisableGlobalSequenceNumber means disabled)
+ const SequenceNumber global_seqno_;
+
+ DataBlockHashIndex data_block_hash_index_;
+};
+
+template <class TValue>
+class BlockIter : public InternalIteratorBase<TValue> {
+ public:
+ void InitializeBase(const Comparator* comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno, bool block_contents_pinned) {
+ assert(data_ == nullptr); // Ensure it is called only once
+ assert(num_restarts > 0); // Ensure the param is valid
+
+ comparator_ = comparator;
+ data_ = data;
+ restarts_ = restarts;
+ num_restarts_ = num_restarts;
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ global_seqno_ = global_seqno;
+ block_contents_pinned_ = block_contents_pinned;
+ cache_handle_ = nullptr;
+ }
+
+ // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
+ // nothing. Calls cleanup functions.
+ void InvalidateBase(Status s) {
+ // Assert that the BlockIter is never deleted while Pinning is Enabled.
+ assert(!pinned_iters_mgr_ ||
+ (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
+
+ data_ = nullptr;
+ current_ = restarts_;
+ status_ = s;
+
+ // Call cleanup callbacks.
+ Cleanable::Reset();
+ }
+
+ bool Valid() const override { return current_ < restarts_; }
+ Status status() const override { return status_; }
+ Slice key() const override {
+ assert(Valid());
+ return key_.GetKey();
+ }
+
+#ifndef NDEBUG
+ ~BlockIter() override {
+ // Assert that the BlockIter is never deleted while Pinning is Enabled.
+ assert(!pinned_iters_mgr_ ||
+ (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool IsKeyPinned() const override {
+ return block_contents_pinned_ && key_pinned_;
+ }
+
+ bool IsValuePinned() const override { return block_contents_pinned_; }
+
+ size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
+
+ uint32_t ValueOffset() const {
+ return static_cast<uint32_t>(value_.data() - data_);
+ }
+
+ void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }
+
+ Cache::Handle* cache_handle() { return cache_handle_; }
+
+ protected:
+  // Note: The type could be changed to InternalKeyComparator, but we observed
+  // a weird performance drop when doing so.
+ const Comparator* comparator_;
+ const char* data_; // underlying block contents
+ uint32_t num_restarts_; // Number of uint32_t entries in restart array
+
+ // Index of restart block in which current_ or current_-1 falls
+ uint32_t restart_index_;
+ uint32_t restarts_; // Offset of restart array (list of fixed32)
+ // current_ is offset in data_ of current entry. >= restarts_ if !Valid
+ uint32_t current_;
+ IterKey key_;
+ Slice value_;
+ Status status_;
+ bool key_pinned_;
+  // Whether the block data is guaranteed to outlive this iterator: as long as
+  // the cleanup functions are transferred to another class, e.g.
+  // PinnableSlice, the pointer to the bytes will remain valid.
+ bool block_contents_pinned_;
+ SequenceNumber global_seqno_;
+
+ private:
+  // Store the cache handle, if the block is cached. We need this since the
+  // only other place the handle is stored is as an argument to the Cleanable
+  // function callback, which is hard to retrieve. When multiple PinnableSlice
+  // values reference the block, they need the cache handle in order to bump
+  // up the ref count.
+ Cache::Handle* cache_handle_;
+
+ public:
+ // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const {
+ // NOTE: We don't support blocks bigger than 2GB
+ return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+ }
+
+ uint32_t GetRestartPoint(uint32_t index) {
+ assert(index < num_restarts_);
+ return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+ }
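+  // For illustration: a block lays out [entry 0]...[entry n-1] followed by
+  // the restart array [restart 0 : fixed32]...[restart k-1 : fixed32] at byte
+  // offset restarts_, so GetRestartPoint(1) decodes the fixed32 at
+  // data_ + restarts_ + 4, i.e. the offset of the first entry of the second
+  // restart interval.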
+
+ void SeekToRestartPoint(uint32_t index) {
+ key_.Clear();
+ restart_index_ = index;
+ // current_ will be fixed by ParseNextKey();
+
+ // ParseNextKey() starts at the end of value_, so set value_ accordingly
+ uint32_t offset = GetRestartPoint(index);
+ value_ = Slice(data_ + offset, 0);
+ }
+
+ void CorruptionError();
+
+ template <typename DecodeKeyFunc>
+ inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+ uint32_t* index, const Comparator* comp);
+};
+
+class DataBlockIter final : public BlockIter<Slice> {
+ public:
+ DataBlockIter()
+ : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
+ DataBlockIter(const Comparator* comparator, const Comparator* user_comparator,
+ const char* data, uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index)
+ : DataBlockIter() {
+ Initialize(comparator, user_comparator, data, restarts, num_restarts,
+ global_seqno, read_amp_bitmap, block_contents_pinned,
+ data_block_hash_index);
+ }
+ void Initialize(const Comparator* comparator,
+ const Comparator* user_comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap,
+ bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index) {
+ InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
+ block_contents_pinned);
+ user_comparator_ = user_comparator;
+ key_.SetIsUserKey(false);
+ read_amp_bitmap_ = read_amp_bitmap;
+ last_bitmap_offset_ = current_ + 1;
+ data_block_hash_index_ = data_block_hash_index;
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ if (read_amp_bitmap_ && current_ < restarts_ &&
+ current_ != last_bitmap_offset_) {
+ read_amp_bitmap_->Mark(current_ /* current entry offset */,
+ NextEntryOffset() - 1);
+ last_bitmap_offset_ = current_;
+ }
+ return value_;
+ }
+
+ void Seek(const Slice& target) override;
+
+ inline bool SeekForGet(const Slice& target) {
+ if (!data_block_hash_index_) {
+ Seek(target);
+ return true;
+ }
+
+ return SeekForGetImpl(target);
+ }
+
+ void SeekForPrev(const Slice& target) override;
+
+ void Prev() override;
+
+ void Next() final override;
+
+ // Try to advance to the next entry in the block. If there is data corruption
+ // or error, report it to the caller instead of aborting the process. May
+ // incur higher CPU overhead because we need to perform check on every entry.
+ void NextOrReport();
+
+ void SeekToFirst() override;
+
+ // Try to seek to the first entry in the block. If there is data corruption
+ // or error, report it to caller instead of aborting the process. May incur
+ // higher CPU overhead because we need to perform check on every entry.
+ void SeekToFirstOrReport();
+
+ void SeekToLast() override;
+
+ void Invalidate(Status s) {
+ InvalidateBase(s);
+ // Clear prev entries cache.
+ prev_entries_keys_buff_.clear();
+ prev_entries_.clear();
+ prev_entries_idx_ = -1;
+ }
+
+ private:
+ // read-amp bitmap
+ BlockReadAmpBitmap* read_amp_bitmap_;
+  // last `current_` value we report to the read-amp bitmap
+ mutable uint32_t last_bitmap_offset_;
+ struct CachedPrevEntry {
+ explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
+ size_t _key_offset, size_t _key_size, Slice _value)
+ : offset(_offset),
+ key_ptr(_key_ptr),
+ key_offset(_key_offset),
+ key_size(_key_size),
+ value(_value) {}
+
+ // offset of entry in block
+ uint32_t offset;
+ // Pointer to key data in block (nullptr if key is delta-encoded)
+ const char* key_ptr;
+ // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
+ size_t key_offset;
+ // size of key
+ size_t key_size;
+ // value slice pointing to data in block
+ Slice value;
+ };
+ std::string prev_entries_keys_buff_;
+ std::vector<CachedPrevEntry> prev_entries_;
+ int32_t prev_entries_idx_ = -1;
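+  // The three members above cache entries already visited while scanning
+  // backward, so consecutive Prev() calls within a restart interval can avoid
+  // re-parsing from the restart point; Invalidate() clears them.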
+
+ DataBlockHashIndex* data_block_hash_index_;
+ const Comparator* user_comparator_;
+
+ template <typename DecodeEntryFunc>
+ inline bool ParseNextDataKey(const char* limit = nullptr);
+
+ inline int Compare(const IterKey& ikey, const Slice& b) const {
+ return comparator_->Compare(ikey.GetInternalKey(), b);
+ }
+
+ bool SeekForGetImpl(const Slice& target);
+};
+
+class IndexBlockIter final : public BlockIter<IndexValue> {
+ public:
+ IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
+
+ Slice key() const override {
+ assert(Valid());
+ return key_.GetKey();
+ }
+  // key_includes_seq, true by default, means that the keys are in internal
+  // key format.
+  // value_is_full, true by default, means that no delta encoding is applied
+  // to values.
+ void Initialize(const Comparator* comparator,
+ const Comparator* user_comparator, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
+ bool have_first_key, bool key_includes_seq,
+ bool value_is_full, bool block_contents_pinned) {
+ InitializeBase(key_includes_seq ? comparator : user_comparator, data,
+ restarts, num_restarts, kDisableGlobalSequenceNumber,
+ block_contents_pinned);
+ key_includes_seq_ = key_includes_seq;
+ key_.SetIsUserKey(!key_includes_seq_);
+ prefix_index_ = prefix_index;
+ value_delta_encoded_ = !value_is_full;
+ have_first_key_ = have_first_key;
+ if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
+ global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
+ } else {
+ global_seqno_state_.reset();
+ }
+ }
+
+ Slice user_key() const override {
+ if (key_includes_seq_) {
+ return ExtractUserKey(key());
+ }
+ return key();
+ }
+
+ IndexValue value() const override {
+ assert(Valid());
+ if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+ return decoded_value_;
+ } else {
+ IndexValue entry;
+ Slice v = value_;
+ Status decode_s __attribute__((__unused__)) =
+ entry.DecodeFrom(&v, have_first_key_, nullptr);
+ assert(decode_s.ok());
+ return entry;
+ }
+ }
+
+  // IndexBlockIter follows a different prefix-iterator contract from data
+  // iterators.
+  // If the prefix of the seek key `target` exists in the file, it must
+  // return the same result as a total order seek.
+  // If the prefix of `target` doesn't exist in the file, it can either
+  // return the result of a total order seek, or set both Valid() = false
+  // and status() = NotFound().
+ void Seek(const Slice& target) override;
+
+ void SeekForPrev(const Slice&) override {
+ assert(false);
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::InvalidArgument(
+ "RocksDB internal error: should never call SeekForPrev() on index "
+ "blocks");
+ key_.Clear();
+ value_.clear();
+ }
+
+ void Prev() override;
+
+ void Next() override;
+
+ void SeekToFirst() override;
+
+ void SeekToLast() override;
+
+ void Invalidate(Status s) { InvalidateBase(s); }
+
+ bool IsValuePinned() const override {
+ return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
+ }
+
+ private:
+ // Key is in InternalKey format
+ bool key_includes_seq_;
+ bool value_delta_encoded_;
+ bool have_first_key_; // value includes first_internal_key
+ BlockPrefixIndex* prefix_index_;
+  // When value_delta_encoded_ is set, the value is assumed to be a
+  // BlockHandle. The first value in each restart interval is the fully
+  // encoded BlockHandle; the rest encode only the size part of the
+  // BlockHandle. The offset of each delta encoded BlockHandle is computed by
+  // adding the sizes of the previous delta encoded values in the same restart
+  // interval to the offset of the first value in that restart interval.
+  // decoded_value_ holds the decoded IndexValue for the current entry.
+  IndexValue decoded_value_;
+
+ // When sequence number overwriting is enabled, this struct contains the seqno
+ // to overwrite with, and current first_internal_key with overwritten seqno.
+ // This is rarely used, so we put it behind a pointer and only allocate when
+ // needed.
+ struct GlobalSeqnoState {
+ // First internal key according to current index entry, but with sequence
+ // number overwritten to global_seqno.
+ IterKey first_internal_key;
+ SequenceNumber global_seqno;
+
+ explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
+ };
+
+ std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
+
+  // Set *prefix_may_exist to false if no key can possibly share the same
+  // prefix as `target`. If not set, the result position should be the same
+  // as a total order Seek.
+ bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
+ // Set *prefix_may_exist to false if no key can possibly share the same
+ // prefix as `target`. If not set, the result position should be the same
+ // as total order seek.
+ bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
+ uint32_t left, uint32_t right, uint32_t* index,
+ bool* prefix_may_exist);
+ inline int CompareBlockKey(uint32_t block_index, const Slice& target);
+
+ inline int Compare(const Slice& a, const Slice& b) const {
+ return comparator_->Compare(a, b);
+ }
+
+ inline int Compare(const IterKey& ikey, const Slice& b) const {
+ return comparator_->Compare(ikey.GetKey(), b);
+ }
+
+ inline bool ParseNextIndexKey();
+
+  // When value_delta_encoded_ is enabled, this decodes the value, which is
+  // assumed to be a BlockHandle, and puts it into decoded_value_.
+ inline void DecodeCurrentValue(uint32_t shared);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_filter_block.cc b/src/rocksdb/table/block_based/block_based_filter_block.cc
new file mode 100644
index 000000000..de3f5cb13
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_filter_block.cc
@@ -0,0 +1,347 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_filter_block.h"
+#include <algorithm>
+
+#include "db/dbformat.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void AppendItem(std::string* props, const std::string& key,
+ const std::string& value) {
+ char cspace = ' ';
+ std::string value_str("");
+ size_t i = 0;
+ const size_t dataLength = 64;
+ const size_t tabLength = 2;
+ const size_t offLength = 16;
+
+ value_str.append(&value[i], std::min(size_t(dataLength), value.size()));
+ i += dataLength;
+ while (i < value.size()) {
+ value_str.append("\n");
+ value_str.append(offLength, cspace);
+ value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i));
+ i += dataLength;
+ }
+
+ std::string result("");
+ if (key.size() < (offLength - tabLength))
+ result.append(size_t((offLength - tabLength)) - key.size(), cspace);
+ result.append(key);
+
+ props->append(result + ": " + value_str + "\n");
+}
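+
+// For example, AppendItem(&props, "Block offset", "Hex dump") appends
+// "  Block offset: Hex dump\n": the key is right-aligned to
+// offLength - tabLength == 14 characters, and the value is wrapped at
+// dataLength == 64 bytes per line.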
+
+template <class TKey>
+void AppendItem(std::string* props, const TKey& key, const std::string& value) {
+ std::string key_str = ROCKSDB_NAMESPACE::ToString(key);
+ AppendItem(props, key_str, value);
+}
+} // namespace
+
+// See doc/table_format.txt for an explanation of the filter block format.
+
+// Generate new filter every 2KB of data
+static const size_t kFilterBaseLg = 11;
+static const size_t kFilterBase = 1 << kFilterBaseLg;
+
+BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
+ const SliceTransform* prefix_extractor,
+ const BlockBasedTableOptions& table_opt)
+ : policy_(table_opt.filter_policy.get()),
+ prefix_extractor_(prefix_extractor),
+ whole_key_filtering_(table_opt.whole_key_filtering),
+ prev_prefix_start_(0),
+ prev_prefix_size_(0),
+ num_added_(0) {
+ assert(policy_);
+}
+
+void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
+ uint64_t filter_index = (block_offset / kFilterBase);
+ assert(filter_index >= filter_offsets_.size());
+ while (filter_index > filter_offsets_.size()) {
+ GenerateFilter();
+ }
+}
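+
+// For example, with kFilterBase == 2048 (kFilterBaseLg == 11),
+// StartBlock(9000) computes filter_index == 4, so GenerateFilter() runs until
+// four filters exist, covering block offsets [0, 8192); keys added afterwards
+// fall into the fifth filter.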
+
+void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
+ if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
+ AddPrefix(key);
+ }
+
+ if (whole_key_filtering_) {
+ AddKey(key);
+ }
+}
+
+// Add key to filter if needed
+inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
+ num_added_++;
+ start_.push_back(entries_.size());
+ entries_.append(key.data(), key.size());
+}
+
+// Add prefix to filter if needed
+inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
+ // get slice for most recently added entry
+ Slice prev;
+ if (prev_prefix_size_ > 0) {
+ prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_);
+ }
+
+ Slice prefix = prefix_extractor_->Transform(key);
+ // insert prefix only when it's different from the previous prefix.
+ if (prev.size() == 0 || prefix != prev) {
+ prev_prefix_start_ = entries_.size();
+ prev_prefix_size_ = prefix.size();
+ AddKey(prefix);
+ }
+}
+
+Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
+ Status* status) {
+ // In this impl we ignore BlockHandle
+ *status = Status::OK();
+ if (!start_.empty()) {
+ GenerateFilter();
+ }
+
+ // Append array of per-filter offsets
+ const uint32_t array_offset = static_cast<uint32_t>(result_.size());
+ for (size_t i = 0; i < filter_offsets_.size(); i++) {
+ PutFixed32(&result_, filter_offsets_[i]);
+ }
+
+ PutFixed32(&result_, array_offset);
+ result_.push_back(kFilterBaseLg); // Save encoding parameter in result
+ return Slice(result_);
+}
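+
+// Filter block layout produced by Finish() above:
+//
+//   [filter 0]
+//   ...
+//   [filter k-1]
+//   [offset of filter 0        : fixed32]
+//   ...
+//   [offset of filter k-1      : fixed32]
+//   [offset of the offset array: fixed32]
+//   [kFilterBaseLg             : 1 byte]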
+
+void BlockBasedFilterBlockBuilder::GenerateFilter() {
+ const size_t num_entries = start_.size();
+ if (num_entries == 0) {
+ // Fast path if there are no keys for this filter
+ filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
+ return;
+ }
+
+ // Make list of keys from flattened key structure
+ start_.push_back(entries_.size()); // Simplify length computation
+ tmp_entries_.resize(num_entries);
+ for (size_t i = 0; i < num_entries; i++) {
+ const char* base = entries_.data() + start_[i];
+ size_t length = start_[i + 1] - start_[i];
+ tmp_entries_[i] = Slice(base, length);
+ }
+
+ // Generate filter for current set of keys and append to result_.
+ filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
+ policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries),
+ &result_);
+
+ tmp_entries_.clear();
+ entries_.clear();
+ start_.clear();
+ prev_prefix_start_ = 0;
+ prev_prefix_size_ = 0;
+}
+
+BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
+ const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block)
+ : FilterBlockReaderCommon(t, std::move(filter_block)) {
+ assert(table());
+ assert(table()->get_rep());
+ assert(table()->get_rep()->filter_policy);
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+
+ CachableEntry<BlockContents> filter_block;
+ if (prefetch || !use_cache) {
+ const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(),
+ use_cache, nullptr /* get_context */,
+ lookup_context, &filter_block);
+ if (!s.ok()) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ if (use_cache && !pin) {
+ filter_block.Reset();
+ }
+ }
+
+ return std::unique_ptr<FilterBlockReader>(
+ new BlockBasedFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool BlockBasedFilterBlockReader::KeyMayMatch(
+ const Slice& key, const SliceTransform* /* prefix_extractor */,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
+ assert(block_offset != kNotValid);
+ if (!whole_key_filtering()) {
+ return true;
+ }
+ return MayMatch(key, block_offset, no_io, get_context, lookup_context);
+}
+
+bool BlockBasedFilterBlockReader::PrefixMayMatch(
+ const Slice& prefix, const SliceTransform* /* prefix_extractor */,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
+ assert(block_offset != kNotValid);
+ return MayMatch(prefix, block_offset, no_io, get_context, lookup_context);
+}
+
+bool BlockBasedFilterBlockReader::ParseFieldsFromBlock(
+ const BlockContents& contents, const char** data, const char** offset,
+ size_t* num, size_t* base_lg) {
+ assert(data);
+ assert(offset);
+ assert(num);
+ assert(base_lg);
+
+ const size_t n = contents.data.size();
+ if (n < 5) { // 1 byte for base_lg and 4 for start of offset array
+ return false;
+ }
+
+ const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5);
+ if (last_word > n - 5) {
+ return false;
+ }
+
+ *data = contents.data.data();
+ *offset = (*data) + last_word;
+ *num = (n - 5 - last_word) / 4;
+ *base_lg = contents.data[n - 1];
+
+ return true;
+}
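+
+// For example, the 5-byte block "\x00\x00\x00\x00\x0b" produced by an empty
+// builder parses as last_word == 0, num == 0 filters, and base_lg == 11.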
+
+bool BlockBasedFilterBlockReader::MayMatch(
+ const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
+ if (!s.ok()) {
+ return true;
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return true; // Errors are treated as potential matches
+ }
+
+ const uint64_t index = block_offset >> base_lg;
+ if (index < num) {
+ const uint32_t start = DecodeFixed32(offset + index * 4);
+ const uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
+ if (start <= limit && limit <= (uint32_t)(offset - data)) {
+ const Slice filter = Slice(data + start, limit - start);
+
+ assert(table());
+ assert(table()->get_rep());
+ const FilterPolicy* const policy = table()->get_rep()->filter_policy;
+
+ const bool may_match = policy->KeyMayMatch(entry, filter);
+ if (may_match) {
+ PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ return true;
+ } else {
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ return false;
+ }
+ } else if (start == limit) {
+ // Empty filters do not match any entries
+ return false;
+ }
+ }
+ return true; // Errors are treated as potential matches
+}
+
+size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+}
+
+std::string BlockBasedFilterBlockReader::ToString() const {
+ CachableEntry<BlockContents> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
+ nullptr /* lookup_context */, &filter_block);
+ if (!s.ok()) {
+ return std::string("Unable to retrieve filter block");
+ }
+
+ assert(filter_block.GetValue());
+
+ const char* data = nullptr;
+ const char* offset = nullptr;
+ size_t num = 0;
+ size_t base_lg = 0;
+ if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num,
+ &base_lg)) {
+ return std::string("Error parsing filter block");
+ }
+
+ std::string result;
+ result.reserve(1024);
+
+ std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks");
+ AppendItem(&result, s_fb, ROCKSDB_NAMESPACE::ToString(num));
+ AppendItem(&result, s_bo, s_hd);
+
+ for (size_t index = 0; index < num; index++) {
+ uint32_t start = DecodeFixed32(offset + index * 4);
+ uint32_t limit = DecodeFixed32(offset + index * 4 + 4);
+
+ if (start != limit) {
+ result.append(" filter block # " +
+ ROCKSDB_NAMESPACE::ToString(index + 1) + "\n");
+ Slice filter = Slice(data + start, limit - start);
+ AppendItem(&result, start, filter.ToString(true));
+ }
+ }
+ return result;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_filter_block.h b/src/rocksdb/table/block_based/block_based_filter_block.h
new file mode 100644
index 000000000..01c98a70b
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_filter_block.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file. It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/format.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp:
+// (StartBlock Add*)* Finish
+class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+ BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor,
+ const BlockBasedTableOptions& table_opt);
+ // No copying allowed
+ BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete;
+ void operator=(const BlockBasedFilterBlockBuilder&) = delete;
+
+ virtual bool IsBlockBased() override { return true; }
+ virtual void StartBlock(uint64_t block_offset) override;
+ virtual void Add(const Slice& key) override;
+ virtual size_t NumAdded() const override { return num_added_; }
+ virtual Slice Finish(const BlockHandle& tmp, Status* status) override;
+ using FilterBlockBuilder::Finish;
+
+ private:
+ void AddKey(const Slice& key);
+ void AddPrefix(const Slice& key);
+ void GenerateFilter();
+
+  // Important: all of these might point to invalid addresses at the time of
+  // the destruction of this filter block. The destructor should NOT
+  // dereference them.
+ const FilterPolicy* policy_;
+ const SliceTransform* prefix_extractor_;
+ bool whole_key_filtering_;
+
+ size_t prev_prefix_start_; // the position of the last appended prefix
+ // to "entries_".
+ size_t prev_prefix_size_; // the length of the last appended prefix to
+ // "entries_".
+ std::string entries_; // Flattened entry contents
+ std::vector<size_t> start_; // Starting index in entries_ of each entry
+ std::string result_; // Filter data computed so far
+ std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
+ std::vector<uint32_t> filter_offsets_;
+ size_t num_added_; // Number of keys added
+};
+
+// A FilterBlockReader is used to parse the filter block from an SST table.
+// KeyMayMatch and PrefixMayMatch trigger filter checking.
+class BlockBasedFilterBlockReader
+ : public FilterBlockReaderCommon<BlockContents> {
+ public:
+ BlockBasedFilterBlockReader(const BlockBasedTable* t,
+ CachableEntry<BlockContents>&& filter_block);
+ // No copying allowed
+ BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&) = delete;
+ void operator=(const BlockBasedFilterBlockReader&) = delete;
+
+ static std::unique_ptr<FilterBlockReader> Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context);
+
+ bool IsBlockBased() override { return true; }
+
+ bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+ bool PrefixMayMatch(const Slice& prefix,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+ size_t ApproximateMemoryUsage() const override;
+
+  // Convert this object to a human-readable form.
+ std::string ToString() const override;
+
+ private:
+ static bool ParseFieldsFromBlock(const BlockContents& contents,
+ const char** data, const char** offset,
+ size_t* num, size_t* base_lg);
+
+ bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_filter_block_test.cc b/src/rocksdb/table/block_based/block_based_filter_block_test.cc
new file mode 100644
index 000000000..283d6a9a2
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_filter_block_test.cc
@@ -0,0 +1,434 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_filter_block.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// For testing: emit an array with one hash value per key
+class TestHashFilter : public FilterPolicy {
+ public:
+ const char* Name() const override { return "TestHashFilter"; }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ for (int i = 0; i < n; i++) {
+ uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+ PutFixed32(dst, h);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ uint32_t h = Hash(key.data(), key.size(), 1);
+ for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+ if (h == DecodeFixed32(filter.data() + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class FilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ FilterBlockTest() : mock::MockBlockBasedTableTester(new TestHashFilter) {}
+};
+
+TEST_F(FilterBlockTest, EmptyBuilder) {
+ BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+ Slice slice(builder.Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+TEST_F(FilterBlockTest, SingleChunk) {
+ BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+ ASSERT_EQ(0, builder.NumAdded());
+ builder.StartBlock(100);
+ builder.Add("foo");
+ builder.Add("bar");
+ builder.Add("box");
+ builder.StartBlock(200);
+ builder.Add("box");
+ builder.StartBlock(300);
+ builder.Add("hello");
+ ASSERT_EQ(5, builder.NumAdded());
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+TEST_F(FilterBlockTest, MultiChunk) {
+ BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+
+ // First filter
+ builder.StartBlock(0);
+ builder.Add("foo");
+ builder.StartBlock(2000);
+ builder.Add("bar");
+
+ // Second filter
+ builder.StartBlock(3100);
+ builder.Add("box");
+
+ // Third filter is empty
+
+ // Last filter
+ builder.StartBlock(9000);
+ builder.Add("box");
+ builder.Add("hello");
+
+ Slice slice(builder.Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ BlockBasedFilterBlockReader reader(table_.get(), std::move(block));
+
+ // Check first filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/2000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check second filter
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check third filter (empty)
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check last filter
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+// Tests for the block-based filter block that use the new FilterPolicy
+// interface to create the filter builder/reader.
+class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ BlockBasedFilterBlockTest()
+ : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {}
+};
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
+ FilterBlockBuilder* builder =
+ new BlockBasedFilterBlockBuilder(nullptr, table_options_);
+ Slice slice(builder->Finish());
+ ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice));
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ delete builder;
+ delete reader;
+}
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
+ FilterBlockBuilder* builder =
+ new BlockBasedFilterBlockBuilder(nullptr, table_options_);
+ builder->StartBlock(100);
+ builder->Add("foo");
+ builder->Add("bar");
+ builder->Add("box");
+ builder->StartBlock(200);
+ builder->Add("box");
+ builder->StartBlock(300);
+ builder->Add("hello");
+ Slice slice(builder->Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ delete builder;
+ delete reader;
+}
+
+TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) {
+ FilterBlockBuilder* builder =
+ new BlockBasedFilterBlockBuilder(nullptr, table_options_);
+
+ // First filter
+ builder->StartBlock(0);
+ builder->Add("foo");
+ builder->StartBlock(2000);
+ builder->Add("bar");
+
+ // Second filter
+ builder->StartBlock(3100);
+ builder->Add("box");
+
+ // Third filter is empty
+
+ // Last filter
+ builder->StartBlock(9000);
+ builder->Add("box");
+ builder->Add("hello");
+
+ Slice slice(builder->Finish());
+
+ CachableEntry<BlockContents> block(
+ new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+
+ FilterBlockReader* reader =
+ new BlockBasedFilterBlockReader(table_.get(), std::move(block));
+
+ // Check first filter
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0},
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check second filter
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check third filter (empty)
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ // Check last filter
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader->KeyMayMatch(
+ "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader->KeyMayMatch(
+ "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+
+ delete builder;
+ delete reader;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/block_based_table_builder.cc b/src/rocksdb/table/block_based/block_based_table_builder.cc
new file mode 100644
index 000000000..2003008fe
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_builder.cc
@@ -0,0 +1,1217 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "db/dbformat.h"
+#include "index_builder.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+#include "table/table_builder.h"
+
+#include "memory/memory_allocator.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+typedef BlockBasedTableOptions::IndexType IndexType;
+
+// Without an anonymous namespace here, these definitions would trigger
+// -Wmissing-prototypes warnings.
+namespace {
+
+// Create a filter block builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(
+ const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+ const FilterBuildingContext& context,
+ const bool use_delta_encoding_for_index_values,
+ PartitionedIndexBuilder* const p_index_builder) {
+ const BlockBasedTableOptions& table_opt = context.table_options;
+ if (table_opt.filter_policy == nullptr) return nullptr;
+
+ FilterBitsBuilder* filter_bits_builder =
+ BloomFilterPolicy::GetBuilderFromContext(context);
+ if (filter_bits_builder == nullptr) {
+ return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(),
+ table_opt);
+ } else {
+ if (table_opt.partition_filters) {
+ assert(p_index_builder != nullptr);
+      // Since it takes time from the filter builder's partition-cut request
+      // until the index builder actually cuts the partition, we take the
+      // lower bound as the partition size.
+ assert(table_opt.block_size_deviation <= 100);
+ auto partition_size =
+ static_cast<uint32_t>(((table_opt.metadata_block_size *
+ (100 - table_opt.block_size_deviation)) +
+ 99) /
+ 100);
+ partition_size = std::max(partition_size, static_cast<uint32_t>(1));
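+      // For example, with metadata_block_size == 4096 and
+      // block_size_deviation == 10, partition_size ==
+      // ((4096 * (100 - 10)) + 99) / 100 == 3687.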
+ return new PartitionedFilterBlockBuilder(
+ mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+ filter_bits_builder, table_opt.index_block_restart_interval,
+ use_delta_encoding_for_index_values, p_index_builder, partition_size);
+ } else {
+ return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+ table_opt.whole_key_filtering,
+ filter_bits_builder);
+ }
+ }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+  // Check whether compression saved at least 12.5%, i.e. the compressed size
+  // is less than 87.5% of the raw size.
+  return compressed_size < raw_size - (raw_size / 8u);
+}
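+
+// For example, with raw_size == 4096 the threshold is 4096 - 512 == 3584:
+// a 3600-byte output is rejected, while a 3500-byte output is accepted.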
+
+bool CompressBlockInternal(const Slice& raw,
+ const CompressionInfo& compression_info,
+ uint32_t format_version,
+ std::string* compressed_output) {
+ // Will return compressed block contents if (1) the compression method is
+ // supported in this platform and (2) the compression rate is "good enough".
+ switch (compression_info.type()) {
+ case kSnappyCompression:
+ return Snappy_Compress(compression_info, raw.data(), raw.size(),
+ compressed_output);
+ case kZlibCompression:
+ return Zlib_Compress(
+ compression_info,
+ GetCompressFormatForVersion(kZlibCompression, format_version),
+ raw.data(), raw.size(), compressed_output);
+ case kBZip2Compression:
+ return BZip2_Compress(
+ compression_info,
+ GetCompressFormatForVersion(kBZip2Compression, format_version),
+ raw.data(), raw.size(), compressed_output);
+ case kLZ4Compression:
+ return LZ4_Compress(
+ compression_info,
+ GetCompressFormatForVersion(kLZ4Compression, format_version),
+ raw.data(), raw.size(), compressed_output);
+ case kLZ4HCCompression:
+ return LZ4HC_Compress(
+ compression_info,
+ GetCompressFormatForVersion(kLZ4HCCompression, format_version),
+ raw.data(), raw.size(), compressed_output);
+ case kXpressCompression:
+ return XPRESS_Compress(raw.data(), raw.size(), compressed_output);
+ case kZSTD:
+ case kZSTDNotFinalCompression:
+ return ZSTD_Compress(compression_info, raw.data(), raw.size(),
+ compressed_output);
+ default:
+ // Do not recognize this compression type
+ return false;
+ }
+}
+
+} // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
+ CompressionType* type, uint32_t format_version,
+ bool do_sample, std::string* compressed_output,
+ std::string* sampled_output_fast,
+ std::string* sampled_output_slow) {
+ *type = info.type();
+
+ if (info.type() == kNoCompression && !info.SampleForCompression()) {
+ return raw;
+ }
+
+  // If requested, we sample one in every N blocks with a
+  // fast and slow compression algorithm and report the stats.
+  // Users can use these stats to decide if it is worthwhile
+  // enabling compression and they also get a hint about which
+  // compression algorithm will be beneficial.
+ if (do_sample && info.SampleForCompression() &&
+ Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) &&
+ sampled_output_fast && sampled_output_slow) {
+ // Sampling with a fast compression algorithm
+ if (LZ4_Supported() || Snappy_Supported()) {
+ CompressionType c =
+ LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+ CompressionContext context(c);
+ CompressionOptions options;
+ CompressionInfo info_tmp(options, context,
+ CompressionDict::GetEmptyDict(), c,
+ info.SampleForCompression());
+
+ CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast);
+ }
+
+ // Sampling with a slow but high-compression algorithm
+ if (ZSTD_Supported() || Zlib_Supported()) {
+ CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
+ CompressionContext context(c);
+ CompressionOptions options;
+ CompressionInfo info_tmp(options, context,
+ CompressionDict::GetEmptyDict(), c,
+ info.SampleForCompression());
+ CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow);
+ }
+ }
+
+ // Actually compress the data
+ if (*type != kNoCompression) {
+ if (CompressBlockInternal(raw, info, format_version, compressed_output) &&
+ GoodCompressionRatio(compressed_output->size(), raw.size())) {
+ return *compressed_output;
+ }
+ }
+
+ // Compression method is not supported, or not good
+ // compression ratio, so just fall back to uncompressed form.
+ *type = kNoCompression;
+ return raw;
+}
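+
+// Illustrative call (the surrounding variables are assumed, not part of this
+// file):
+//
+//   CompressionType type;
+//   std::string compressed, sampled_fast, sampled_slow;
+//   Slice contents = CompressBlock(raw, info, &type, format_version,
+//                                  true /* do_sample */, &compressed,
+//                                  &sampled_fast, &sampled_slow);
+//   // With an unsupported codec or a poor ratio, `type` becomes
+//   // kNoCompression and `contents` aliases `raw`.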
+
+// kBlockBasedTableMagicNumber was picked by running
+// echo rocksdb.table.block_based | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by other
+// .cc files; for that reason we declare it extern in the header, but to get
+// the space allocated the definition must be non-extern in exactly one place.
+const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// We also support reading and writing legacy block based table format (for
+// backwards compatibility)
+const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// A collector that collects properties of interest to block-based tables.
+// For now this class looks heavyweight since we only write one additional
+// property, but in the foreseeable future we will add more and more
+// properties that are specific to block-based tables.
+class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
+ : public IntTblPropCollector {
+ public:
+ explicit BlockBasedTablePropertiesCollector(
+ BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
+ bool prefix_filtering)
+ : index_type_(index_type),
+ whole_key_filtering_(whole_key_filtering),
+ prefix_filtering_(prefix_filtering) {}
+
+ Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+ // Intentionally left blank. Have no interest in collecting stats for
+ // individual key/value pairs.
+ return Status::OK();
+ }
+
+ virtual void BlockAdd(uint64_t /* blockRawBytes */,
+ uint64_t /* blockCompressedBytesFast */,
+ uint64_t /* blockCompressedBytesSlow */) override {
+ // Intentionally left blank. No interest in collecting stats for
+ // blocks.
+ return;
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string val;
+ PutFixed32(&val, static_cast<uint32_t>(index_type_));
+ properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
+ properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
+ whole_key_filtering_ ? kPropTrue : kPropFalse});
+ properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
+ prefix_filtering_ ? kPropTrue : kPropFalse});
+ return Status::OK();
+ }
+
+ // The name of the properties collector can be used for debugging purpose.
+ const char* Name() const override {
+ return "BlockBasedTablePropertiesCollector";
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ // Intentionally left blank.
+ return UserCollectedProperties();
+ }
+
+ private:
+ BlockBasedTableOptions::IndexType index_type_;
+ bool whole_key_filtering_;
+ bool prefix_filtering_;
+};
+
+struct BlockBasedTableBuilder::Rep {
+ const ImmutableCFOptions ioptions;
+ const MutableCFOptions moptions;
+ const BlockBasedTableOptions table_options;
+ const InternalKeyComparator& internal_comparator;
+ WritableFileWriter* file;
+ uint64_t offset = 0;
+ Status status;
+ size_t alignment;
+ BlockBuilder data_block;
+ // Buffers uncompressed data blocks and keys to replay later. Needed when
+ // compression dictionary is enabled so we can finalize the dictionary before
+ // compressing any data blocks.
+ // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data
+ // blocks as it's redundant, but it's easier to implement for now.
+ std::vector<std::pair<std::string, std::vector<std::string>>>
+ data_block_and_keys_buffers;
+ BlockBuilder range_del_block;
+
+ InternalKeySliceTransform internal_prefix_transform;
+ std::unique_ptr<IndexBuilder> index_builder;
+ PartitionedIndexBuilder* p_index_builder_ = nullptr;
+
+ std::string last_key;
+ CompressionType compression_type;
+ uint64_t sample_for_compression;
+ CompressionOptions compression_opts;
+ std::unique_ptr<CompressionDict> compression_dict;
+ CompressionContext compression_ctx;
+ std::unique_ptr<UncompressionContext> verify_ctx;
+ std::unique_ptr<UncompressionDict> verify_dict;
+
+ size_t data_begin_offset = 0;
+
+ TableProperties props;
+
+ // States of the builder.
+ //
+ // - `kBuffered`: This is the initial state where zero or more data blocks are
+ // accumulated uncompressed in-memory. From this state, call
+ // `EnterUnbuffered()` to finalize the compression dictionary if enabled,
+ // compress/write out any buffered blocks, and proceed to the `kUnbuffered`
+ // state.
+ //
+ // - `kUnbuffered`: This is the state when compression dictionary is finalized
+ // either because it wasn't enabled in the first place or it's been created
+ // from sampling previously buffered data. In this state, blocks are simply
+ // compressed/written out as they fill up. From this state, call `Finish()`
+ // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete
+ // the partially created file.
+ //
+ // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been
+ // called, so the table builder is no longer usable. We must be in this
+ // state by the time the destructor runs.
+ enum class State {
+ kBuffered,
+ kUnbuffered,
+ kClosed,
+ };
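+ // A minimal sketch of the lifecycle, assuming dictionary compression is
+ // enabled (so the builder starts in kBuffered):
+ //
+ // kBuffered --EnterUnbuffered()--> kUnbuffered --Finish()/Abandon()--> kClosed
+ //
+ // Without a compression dictionary the builder starts in kUnbuffered.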
+ State state;
+
+ const bool use_delta_encoding_for_index_values;
+ std::unique_ptr<FilterBlockBuilder> filter_builder;
+ char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
+ size_t compressed_cache_key_prefix_size;
+
+ BlockHandle pending_handle; // Handle to add to index block
+
+ std::string compressed_output;
+ std::unique_ptr<FlushBlockPolicy> flush_block_policy;
+ int level_at_creation;
+ uint32_t column_family_id;
+ const std::string& column_family_name;
+ uint64_t creation_time = 0;
+ uint64_t oldest_key_time = 0;
+ const uint64_t target_file_size;
+ uint64_t file_creation_time = 0;
+
+ std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
+
+ Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions,
+ const BlockBasedTableOptions& table_opt,
+ const InternalKeyComparator& icomparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t _column_family_id, WritableFileWriter* f,
+ const CompressionType _compression_type,
+ const uint64_t _sample_for_compression,
+ const CompressionOptions& _compression_opts, const bool skip_filters,
+ const int _level_at_creation, const std::string& _column_family_name,
+ const uint64_t _creation_time, const uint64_t _oldest_key_time,
+ const uint64_t _target_file_size, const uint64_t _file_creation_time)
+ : ioptions(_ioptions),
+ moptions(_moptions),
+ table_options(table_opt),
+ internal_comparator(icomparator),
+ file(f),
+ alignment(table_options.block_align
+ ? std::min(table_options.block_size, kDefaultPageSize)
+ : 0),
+ data_block(table_options.block_restart_interval,
+ table_options.use_delta_encoding,
+ false /* use_value_delta_encoding */,
+ icomparator.user_comparator()
+ ->CanKeysWithDifferentByteContentsBeEqual()
+ ? BlockBasedTableOptions::kDataBlockBinarySearch
+ : table_options.data_block_index_type,
+ table_options.data_block_hash_table_util_ratio),
+ range_del_block(1 /* block_restart_interval */),
+ internal_prefix_transform(_moptions.prefix_extractor.get()),
+ compression_type(_compression_type),
+ sample_for_compression(_sample_for_compression),
+ compression_opts(_compression_opts),
+ compression_dict(),
+ compression_ctx(_compression_type),
+ verify_dict(),
+ state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered
+ : State::kUnbuffered),
+ use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
+ !table_opt.block_align),
+ compressed_cache_key_prefix_size(0),
+ flush_block_policy(
+ table_options.flush_block_policy_factory->NewFlushBlockPolicy(
+ table_options, data_block)),
+ level_at_creation(_level_at_creation),
+ column_family_id(_column_family_id),
+ column_family_name(_column_family_name),
+ creation_time(_creation_time),
+ oldest_key_time(_oldest_key_time),
+ target_file_size(_target_file_size),
+ file_creation_time(_file_creation_time) {
+ if (table_options.index_type ==
+ BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
+ &internal_comparator, use_delta_encoding_for_index_values,
+ table_options);
+ index_builder.reset(p_index_builder_);
+ } else {
+ index_builder.reset(IndexBuilder::CreateIndexBuilder(
+ table_options.index_type, &internal_comparator,
+ &this->internal_prefix_transform, use_delta_encoding_for_index_values,
+ table_options));
+ }
+ if (skip_filters) {
+ filter_builder = nullptr;
+ } else {
+ FilterBuildingContext context(table_options);
+ context.column_family_name = column_family_name;
+ context.compaction_style = ioptions.compaction_style;
+ context.level_at_creation = level_at_creation;
+ context.info_log = ioptions.info_log;
+ filter_builder.reset(CreateFilterBlockBuilder(
+ ioptions, moptions, context, use_delta_encoding_for_index_values,
+ p_index_builder_));
+ }
+
+ for (auto& collector_factories : *int_tbl_prop_collector_factories) {
+ table_properties_collectors.emplace_back(
+ collector_factories->CreateIntTblPropCollector(column_family_id));
+ }
+ table_properties_collectors.emplace_back(
+ new BlockBasedTablePropertiesCollector(
+ table_options.index_type, table_options.whole_key_filtering,
+ _moptions.prefix_extractor != nullptr));
+ if (table_options.verify_compression) {
+ verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(),
+ compression_type));
+ }
+ }
+
+ Rep(const Rep&) = delete;
+ Rep& operator=(const Rep&) = delete;
+
+ ~Rep() {}
+};
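+
+// Rep is the "representation" (pimpl) of BlockBasedTableBuilder: the builder
+// itself holds only a Rep* (declared in the header), which keeps this large
+// block of state out of the public interface.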
+
+BlockBasedTableBuilder::BlockBasedTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, WritableFileWriter* file,
+ const CompressionType compression_type,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, const bool skip_filters,
+ const std::string& column_family_name, const int level_at_creation,
+ const uint64_t creation_time, const uint64_t oldest_key_time,
+ const uint64_t target_file_size, const uint64_t file_creation_time) {
+ BlockBasedTableOptions sanitized_table_options(table_options);
+ if (sanitized_table_options.format_version == 0 &&
+ sanitized_table_options.checksum != kCRC32c) {
+ ROCKS_LOG_WARN(
+ ioptions.info_log,
+ "Silently converting format_version to 1 because checksum is "
+ "non-default");
+ // Silently convert format_version to 1 to stay consistent with current
+ // behavior.
+ sanitized_table_options.format_version = 1;
+ }
+
+ rep_ = new Rep(ioptions, moptions, sanitized_table_options,
+ internal_comparator, int_tbl_prop_collector_factories,
+ column_family_id, file, compression_type,
+ sample_for_compression, compression_opts, skip_filters,
+ level_at_creation, column_family_name, creation_time,
+ oldest_key_time, target_file_size, file_creation_time);
+
+ if (rep_->filter_builder != nullptr) {
+ rep_->filter_builder->StartBlock(0);
+ }
+ if (table_options.block_cache_compressed.get() != nullptr) {
+ BlockBasedTable::GenerateCachePrefix(
+ table_options.block_cache_compressed.get(), file->writable_file(),
+ &rep_->compressed_cache_key_prefix[0],
+ &rep_->compressed_cache_key_prefix_size);
+ }
+}
+
+BlockBasedTableBuilder::~BlockBasedTableBuilder() {
+ // Catch errors where caller forgot to call Finish()
+ assert(rep_->state == Rep::State::kClosed);
+ delete rep_;
+}
+
+void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+ Rep* r = rep_;
+ assert(rep_->state != Rep::State::kClosed);
+ if (!ok()) return;
+ ValueType value_type = ExtractValueType(key);
+ if (IsValueType(value_type)) {
+#ifndef NDEBUG
+ if (r->props.num_entries > r->props.num_range_deletions) {
+ assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
+ }
+#endif // NDEBUG
+
+ auto should_flush = r->flush_block_policy->Update(key, value);
+ if (should_flush) {
+ assert(!r->data_block.empty());
+ Flush();
+
+ if (r->state == Rep::State::kBuffered &&
+ r->data_begin_offset > r->target_file_size) {
+ EnterUnbuffered();
+ }
+
+ // Add item to index block.
+ // We do not emit the index entry for a block until we have seen the
+ // first key for the next data block. This allows us to use shorter
+ // keys in the index block. For example, consider a block boundary
+ // between the keys "the quick brown fox" and "the who". We can use
+ // "the r" as the key for the index block entry since it is >= all
+ // entries in the first block and < all entries in subsequent
+ // blocks.
+ if (ok() && r->state == Rep::State::kUnbuffered) {
+ r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
+ }
+ }
+
+ // Note: PartitionedFilterBlockBuilder requires the key to be added to the
+ // filter builder only after it has been added to the index builder.
+ if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) {
+ size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size();
+ r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
+ }
+
+ r->last_key.assign(key.data(), key.size());
+ r->data_block.Add(key, value);
+ if (r->state == Rep::State::kBuffered) {
+ // Buffer keys to be replayed during `Finish()` once compression
+ // dictionary has been finalized.
+ if (r->data_block_and_keys_buffers.empty() || should_flush) {
+ r->data_block_and_keys_buffers.emplace_back();
+ }
+ r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString());
+ } else {
+ r->index_builder->OnKeyAdded(key);
+ }
+ NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
+ r->table_properties_collectors,
+ r->ioptions.info_log);
+
+ } else if (value_type == kTypeRangeDeletion) {
+ r->range_del_block.Add(key, value);
+ NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
+ r->table_properties_collectors,
+ r->ioptions.info_log);
+ } else {
+ assert(false);
+ }
+
+ r->props.num_entries++;
+ r->props.raw_key_size += key.size();
+ r->props.raw_value_size += value.size();
+ if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) {
+ r->props.num_deletions++;
+ } else if (value_type == kTypeRangeDeletion) {
+ r->props.num_deletions++;
+ r->props.num_range_deletions++;
+ } else if (value_type == kTypeMerge) {
+ r->props.num_merge_operands++;
+ }
+}
+
+void BlockBasedTableBuilder::Flush() {
+ Rep* r = rep_;
+ assert(rep_->state != Rep::State::kClosed);
+ if (!ok()) return;
+ if (r->data_block.empty()) return;
+ WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */);
+}
+
+void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
+ BlockHandle* handle,
+ bool is_data_block) {
+ WriteBlock(block->Finish(), handle, is_data_block);
+ block->Reset();
+}
+
+void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
+ BlockHandle* handle,
+ bool is_data_block) {
+ // File format contains a sequence of blocks where each block has:
+ // block_data: uint8[n]
+ // type: uint8
+ // crc: uint32
+ assert(ok());
+ Rep* r = rep_;
+
+ auto type = r->compression_type;
+ uint64_t sample_for_compression = r->sample_for_compression;
+ Slice block_contents;
+ bool abort_compression = false;
+
+ StopWatchNano timer(
+ r->ioptions.env,
+ ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics));
+
+ if (r->state == Rep::State::kBuffered) {
+ assert(is_data_block);
+ assert(!r->data_block_and_keys_buffers.empty());
+ r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString();
+ r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size();
+ return;
+ }
+
+ if (raw_block_contents.size() < kCompressionSizeLimit) {
+ const CompressionDict* compression_dict;
+ if (!is_data_block || r->compression_dict == nullptr) {
+ compression_dict = &CompressionDict::GetEmptyDict();
+ } else {
+ compression_dict = r->compression_dict.get();
+ }
+ assert(compression_dict != nullptr);
+ CompressionInfo compression_info(r->compression_opts, r->compression_ctx,
+ *compression_dict, type,
+ sample_for_compression);
+
+ std::string sampled_output_fast;
+ std::string sampled_output_slow;
+ block_contents = CompressBlock(
+ raw_block_contents, compression_info, &type,
+ r->table_options.format_version, is_data_block /* do_sample */,
+ &r->compressed_output, &sampled_output_fast, &sampled_output_slow);
+
+ // notify collectors on block add
+ NotifyCollectTableCollectorsOnBlockAdd(
+ r->table_properties_collectors, raw_block_contents.size(),
+ sampled_output_fast.size(), sampled_output_slow.size());
+
+ // Some of the compression algorithms are known to be unreliable. If
+ // the verify_compression flag is set then try to decompress the
+ // compressed data and compare it to the input.
+ if (type != kNoCompression && r->table_options.verify_compression) {
+ // Retrieve the uncompressed contents into a new buffer
+ const UncompressionDict* verify_dict;
+ if (!is_data_block || r->verify_dict == nullptr) {
+ verify_dict = &UncompressionDict::GetEmptyDict();
+ } else {
+ verify_dict = r->verify_dict.get();
+ }
+ assert(verify_dict != nullptr);
+ BlockContents contents;
+ UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict,
+ r->compression_type);
+ Status stat = UncompressBlockContentsForCompressionType(
+ uncompression_info, block_contents.data(), block_contents.size(),
+ &contents, r->table_options.format_version, r->ioptions);
+
+ if (stat.ok()) {
+ bool compressed_ok = contents.data.compare(raw_block_contents) == 0;
+ if (!compressed_ok) {
+ // The result of the compression was invalid. Abort.
+ abort_compression = true;
+ ROCKS_LOG_ERROR(r->ioptions.info_log,
+ "Decompressed block did not match raw block");
+ r->status =
+ Status::Corruption("Decompressed block did not match raw block");
+ }
+ } else {
+ // Decompression reported an error. Abort.
+ r->status = Status::Corruption("Could not decompress");
+ abort_compression = true;
+ }
+ }
+ } else {
+ // Block is too big to be compressed.
+ abort_compression = true;
+ }
+
+ // Abort compression if the block is too big, or did not pass
+ // verification.
+ if (abort_compression) {
+ RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED);
+ type = kNoCompression;
+ block_contents = raw_block_contents;
+ } else if (type != kNoCompression) {
+ if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) {
+ RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS,
+ timer.ElapsedNanos());
+ }
+ RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED,
+ raw_block_contents.size());
+ RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED);
+ } else if (type != r->compression_type) {
+ RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED);
+ }
+
+ WriteRawBlock(block_contents, type, handle, is_data_block);
+ r->compressed_output.clear();
+ if (is_data_block) {
+ if (r->filter_builder != nullptr) {
+ r->filter_builder->StartBlock(r->offset);
+ }
+ r->props.data_size = r->offset;
+ ++r->props.num_data_blocks;
+ }
+}
+
+void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
+ CompressionType type,
+ BlockHandle* handle,
+ bool is_data_block) {
+ Rep* r = rep_;
+ StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS);
+ handle->set_offset(r->offset);
+ handle->set_size(block_contents.size());
+ assert(r->status.ok());
+ r->status = r->file->Append(block_contents);
+ if (r->status.ok()) {
+ char trailer[kBlockTrailerSize];
+ trailer[0] = type;
+ char* trailer_without_type = trailer + 1;
+ switch (r->table_options.checksum) {
+ case kNoChecksum:
+ EncodeFixed32(trailer_without_type, 0);
+ break;
+ case kCRC32c: {
+ auto crc = crc32c::Value(block_contents.data(), block_contents.size());
+ crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type
+ EncodeFixed32(trailer_without_type, crc32c::Mask(crc));
+ break;
+ }
+ case kxxHash: {
+ XXH32_state_t* const state = XXH32_createState();
+ XXH32_reset(state, 0);
+ XXH32_update(state, block_contents.data(),
+ static_cast<uint32_t>(block_contents.size()));
+ XXH32_update(state, trailer, 1); // Extend to cover block type
+ EncodeFixed32(trailer_without_type, XXH32_digest(state));
+ XXH32_freeState(state);
+ break;
+ }
+ case kxxHash64: {
+ XXH64_state_t* const state = XXH64_createState();
+ XXH64_reset(state, 0);
+ XXH64_update(state, block_contents.data(),
+ static_cast<uint32_t>(block_contents.size()));
+ XXH64_update(state, trailer, 1); // Extend to cover block type
+ EncodeFixed32(
+ trailer_without_type,
+ static_cast<uint32_t>(XXH64_digest(state) & // lower 32 bits
+ uint64_t{0xffffffff}));
+ XXH64_freeState(state);
+ break;
+ }
+ }
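+ // Note: the trailer appended below is kBlockTrailerSize (5) bytes: a
+ // 1-byte compression type followed by a 4-byte checksum that, for
+ // non-kNoChecksum types, covers the block contents plus the type byte.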
+
+ assert(r->status.ok());
+ TEST_SYNC_POINT_CALLBACK(
+ "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
+ static_cast<char*>(trailer));
+ r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
+ if (r->status.ok()) {
+ r->status = InsertBlockInCache(block_contents, type, handle);
+ }
+ if (r->status.ok()) {
+ r->offset += block_contents.size() + kBlockTrailerSize;
+ if (r->table_options.block_align && is_data_block) {
+ size_t pad_bytes =
+ (r->alignment - ((block_contents.size() + kBlockTrailerSize) &
+ (r->alignment - 1))) &
+ (r->alignment - 1);
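+ // Example with hypothetical numbers: with alignment == 4096 and a block
+ // plus trailer of 5000 bytes, 5000 & 4095 == 904, so pad_bytes ==
+ // (4096 - 904) & 4095 == 3192, padding to the next 4KB boundary. The
+ // alignment is a power of two whenever block_align is set (enforced in
+ // SanitizeOptions).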
+ r->status = r->file->Pad(pad_bytes);
+ if (r->status.ok()) {
+ r->offset += pad_bytes;
+ }
+ }
+ }
+ }
+}
+
+Status BlockBasedTableBuilder::status() const { return rep_->status; }
+
+static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) {
+ BlockContents* bc = reinterpret_cast<BlockContents*>(value);
+ delete bc;
+}
+
+//
+// Make a copy of the block contents and insert into compressed block cache
+//
+Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
+ const CompressionType type,
+ const BlockHandle* handle) {
+ Rep* r = rep_;
+ Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();
+
+ if (type != kNoCompression && block_cache_compressed != nullptr) {
+ size_t size = block_contents.size();
+
+ auto ubuf =
+ AllocateBlock(size + 1, block_cache_compressed->memory_allocator());
+ memcpy(ubuf.get(), block_contents.data(), size);
+ ubuf[size] = type;
+
+ BlockContents* block_contents_to_cache =
+ new BlockContents(std::move(ubuf), size);
+#ifndef NDEBUG
+ block_contents_to_cache->is_raw_block = true;
+#endif // NDEBUG
+
+ // make cache key by appending the file offset to the cache prefix id
+ char* end = EncodeVarint64(
+ r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size,
+ handle->offset());
+ Slice key(r->compressed_cache_key_prefix,
+ static_cast<size_t>(end - r->compressed_cache_key_prefix));
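+ // For example, with a prefix of p bytes and offset 8192, the key is
+ // p + 2 bytes long, since the varint64 encoding of 8192 takes two bytes.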
+
+ // Insert into compressed block cache.
+ block_cache_compressed->Insert(
+ key, block_contents_to_cache,
+ block_contents_to_cache->ApproximateMemoryUsage(),
+ &DeleteCachedBlockContents);
+
+ // Invalidate OS cache.
+ r->file->InvalidateCache(static_cast<size_t>(r->offset), size);
+ }
+ return Status::OK();
+}
+
+void BlockBasedTableBuilder::WriteFilterBlock(
+ MetaIndexBuilder* meta_index_builder) {
+ BlockHandle filter_block_handle;
+ bool empty_filter_block = (rep_->filter_builder == nullptr ||
+ rep_->filter_builder->NumAdded() == 0);
+ if (ok() && !empty_filter_block) {
+ Status s = Status::Incomplete();
+ while (ok() && s.IsIncomplete()) {
+ Slice filter_content =
+ rep_->filter_builder->Finish(filter_block_handle, &s);
+ assert(s.ok() || s.IsIncomplete());
+ rep_->props.filter_size += filter_content.size();
+ WriteRawBlock(filter_content, kNoCompression, &filter_block_handle);
+ }
+ }
+ if (ok() && !empty_filter_block) {
+ // Add mapping from "<filter_block_prefix>.Name" to location
+ // of filter data.
+ std::string key;
+ if (rep_->filter_builder->IsBlockBased()) {
+ key = BlockBasedTable::kFilterBlockPrefix;
+ } else {
+ key = rep_->table_options.partition_filters
+ ? BlockBasedTable::kPartitionedFilterBlockPrefix
+ : BlockBasedTable::kFullFilterBlockPrefix;
+ }
+ key.append(rep_->table_options.filter_policy->Name());
+ meta_index_builder->Add(key, filter_block_handle);
+ }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+ MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+ IndexBuilder::IndexBlocks index_blocks;
+ auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+ if (index_builder_status.IsIncomplete()) {
+ // If we have more than one index partition, then meta_blocks are not
+ // supported for the index. Currently meta_blocks are used only by
+ // HashIndexBuilder, which is not multi-partition.
+ assert(index_blocks.meta_blocks.empty());
+ } else if (ok() && !index_builder_status.ok()) {
+ rep_->status = index_builder_status;
+ }
+ if (ok()) {
+ for (const auto& item : index_blocks.meta_blocks) {
+ BlockHandle block_handle;
+ WriteBlock(item.second, &block_handle, false /* is_data_block */);
+ if (!ok()) {
+ break;
+ }
+ meta_index_builder->Add(item.first, block_handle);
+ }
+ }
+ if (ok()) {
+ if (rep_->table_options.enable_index_compression) {
+ WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+ } else {
+ WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+ index_block_handle);
+ }
+ }
+ // If there are more index partitions, finish them and write them out
+ Status s = index_builder_status;
+ while (ok() && s.IsIncomplete()) {
+ s = rep_->index_builder->Finish(&index_blocks, *index_block_handle);
+ if (!s.ok() && !s.IsIncomplete()) {
+ rep_->status = s;
+ return;
+ }
+ if (rep_->table_options.enable_index_compression) {
+ WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+ } else {
+ WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+ index_block_handle);
+ }
+ // The last index_block_handle will be for the partition index block
+ }
+}
+
+void BlockBasedTableBuilder::WritePropertiesBlock(
+ MetaIndexBuilder* meta_index_builder) {
+ BlockHandle properties_block_handle;
+ if (ok()) {
+ PropertyBlockBuilder property_block_builder;
+ rep_->props.column_family_id = rep_->column_family_id;
+ rep_->props.column_family_name = rep_->column_family_name;
+ rep_->props.filter_policy_name =
+ rep_->table_options.filter_policy != nullptr
+ ? rep_->table_options.filter_policy->Name()
+ : "";
+ rep_->props.index_size =
+ rep_->index_builder->IndexSize() + kBlockTrailerSize;
+ rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
+ ? rep_->ioptions.user_comparator->Name()
+ : "nullptr";
+ rep_->props.merge_operator_name =
+ rep_->ioptions.merge_operator != nullptr
+ ? rep_->ioptions.merge_operator->Name()
+ : "nullptr";
+ rep_->props.compression_name =
+ CompressionTypeToString(rep_->compression_type);
+ rep_->props.compression_options =
+ CompressionOptionsToString(rep_->compression_opts);
+ rep_->props.prefix_extractor_name =
+ rep_->moptions.prefix_extractor != nullptr
+ ? rep_->moptions.prefix_extractor->Name()
+ : "nullptr";
+
+ std::string property_collectors_names = "[";
+ for (size_t i = 0;
+ i < rep_->ioptions.table_properties_collector_factories.size(); ++i) {
+ if (i != 0) {
+ property_collectors_names += ",";
+ }
+ property_collectors_names +=
+ rep_->ioptions.table_properties_collector_factories[i]->Name();
+ }
+ property_collectors_names += "]";
+ rep_->props.property_collectors_names = property_collectors_names;
+ if (rep_->table_options.index_type ==
+ BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ assert(rep_->p_index_builder_ != nullptr);
+ rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
+ rep_->props.top_level_index_size =
+ rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+ }
+ rep_->props.index_key_is_user_key =
+ !rep_->index_builder->seperator_is_key_plus_seq();
+ rep_->props.index_value_is_delta_encoded =
+ rep_->use_delta_encoding_for_index_values;
+ rep_->props.creation_time = rep_->creation_time;
+ rep_->props.oldest_key_time = rep_->oldest_key_time;
+ rep_->props.file_creation_time = rep_->file_creation_time;
+
+ // Add basic properties
+ property_block_builder.AddTableProperty(rep_->props);
+
+ // Add user collected properties
+ NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors,
+ rep_->ioptions.info_log,
+ &property_block_builder);
+
+ WriteRawBlock(property_block_builder.Finish(), kNoCompression,
+ &properties_block_handle);
+ }
+ if (ok()) {
+#ifndef NDEBUG
+ {
+ uint64_t props_block_offset = properties_block_handle.offset();
+ uint64_t props_block_size = properties_block_handle.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ &props_block_offset);
+ TEST_SYNC_POINT_CALLBACK(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ &props_block_size);
+ }
+#endif // !NDEBUG
+ meta_index_builder->Add(kPropertiesBlock, properties_block_handle);
+ }
+}
+
+void BlockBasedTableBuilder::WriteCompressionDictBlock(
+ MetaIndexBuilder* meta_index_builder) {
+ if (rep_->compression_dict != nullptr &&
+ rep_->compression_dict->GetRawDict().size()) {
+ BlockHandle compression_dict_block_handle;
+ if (ok()) {
+ WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression,
+ &compression_dict_block_handle);
+#ifndef NDEBUG
+ Slice compression_dict = rep_->compression_dict->GetRawDict();
+ TEST_SYNC_POINT_CALLBACK(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ &compression_dict);
+#endif // NDEBUG
+ }
+ if (ok()) {
+ meta_index_builder->Add(kCompressionDictBlock,
+ compression_dict_block_handle);
+ }
+ }
+}
+
+void BlockBasedTableBuilder::WriteRangeDelBlock(
+ MetaIndexBuilder* meta_index_builder) {
+ if (ok() && !rep_->range_del_block.empty()) {
+ BlockHandle range_del_block_handle;
+ WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
+ &range_del_block_handle);
+ meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
+ }
+}
+
+void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
+ BlockHandle& index_block_handle) {
+ Rep* r = rep_;
+ // No need to write out new footer if we're using default checksum.
+ // We're writing the legacy magic number because we want old versions of
+ // RocksDB to be able to read files generated with a new release (just in
+ // case somebody wants to roll back after an upgrade)
+ // TODO(icanadi) at some point in the future, when we're absolutely sure
+ // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+ // number and always write new table files with new magic number
+ bool legacy = (r->table_options.format_version == 0);
+ // this is guaranteed by BlockBasedTableBuilder's constructor
+ assert(r->table_options.checksum == kCRC32c ||
+ r->table_options.format_version != 0);
+ Footer footer(
+ legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
+ r->table_options.format_version);
+ footer.set_metaindex_handle(metaindex_block_handle);
+ footer.set_index_handle(index_block_handle);
+ footer.set_checksum(r->table_options.checksum);
+ std::string footer_encoding;
+ footer.EncodeTo(&footer_encoding);
+ assert(r->status.ok());
+ r->status = r->file->Append(footer_encoding);
+ if (r->status.ok()) {
+ r->offset += footer_encoding.size();
+ }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+ Rep* r = rep_;
+ assert(r->state == Rep::State::kBuffered);
+ r->state = Rep::State::kUnbuffered;
+ const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+ ? r->compression_opts.zstd_max_train_bytes
+ : r->compression_opts.max_dict_bytes;
+ Random64 generator{r->creation_time};
+ std::string compression_dict_samples;
+ std::vector<size_t> compression_dict_sample_lens;
+ if (!r->data_block_and_keys_buffers.empty()) {
+ while (compression_dict_samples.size() < kSampleBytes) {
+ size_t rand_idx =
+ static_cast<size_t>(
+ generator.Uniform(r->data_block_and_keys_buffers.size()));
+ size_t copy_len =
+ std::min(kSampleBytes - compression_dict_samples.size(),
+ r->data_block_and_keys_buffers[rand_idx].first.size());
+ compression_dict_samples.append(
+ r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
+ compression_dict_sample_lens.emplace_back(copy_len);
+ }
+ }
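+ // Note: sampling copies prefixes of uniformly chosen buffered blocks, with
+ // replacement, until kSampleBytes have been collected, so the same block
+ // may contribute more than once.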
+
+ // The final data block has been flushed; now we can generate the dictionary
+ // from the samples. It is OK if compression_dict_samples is empty: we will
+ // just get an empty dictionary.
+ std::string dict;
+ if (r->compression_opts.zstd_max_train_bytes > 0) {
+ dict = ZSTD_TrainDictionary(compression_dict_samples,
+ compression_dict_sample_lens,
+ r->compression_opts.max_dict_bytes);
+ } else {
+ dict = std::move(compression_dict_samples);
+ }
+ r->compression_dict.reset(new CompressionDict(dict, r->compression_type,
+ r->compression_opts.level));
+ r->verify_dict.reset(new UncompressionDict(
+ dict, r->compression_type == kZSTD ||
+ r->compression_type == kZSTDNotFinalCompression));
+
+ for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) {
+ const auto& data_block = r->data_block_and_keys_buffers[i].first;
+ auto& keys = r->data_block_and_keys_buffers[i].second;
+ assert(!data_block.empty());
+ assert(!keys.empty());
+
+ for (const auto& key : keys) {
+ if (r->filter_builder != nullptr) {
+ size_t ts_sz =
+ r->internal_comparator.user_comparator()->timestamp_size();
+ r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
+ }
+ r->index_builder->OnKeyAdded(key);
+ }
+ WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */);
+ if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) {
+ Slice first_key_in_next_block =
+ r->data_block_and_keys_buffers[i + 1].second.front();
+ Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
+ r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr,
+ r->pending_handle);
+ }
+ }
+ r->data_block_and_keys_buffers.clear();
+}
+
+Status BlockBasedTableBuilder::Finish() {
+ Rep* r = rep_;
+ assert(r->state != Rep::State::kClosed);
+ bool empty_data_block = r->data_block.empty();
+ Flush();
+ if (r->state == Rep::State::kBuffered) {
+ EnterUnbuffered();
+ }
+ // To make sure the properties block records the accurate size of the index
+ // block, we finish writing all index entries first.
+ if (ok() && !empty_data_block) {
+ r->index_builder->AddIndexEntry(
+ &r->last_key, nullptr /* no next data block */, r->pending_handle);
+ }
+
+ // Write meta blocks, metaindex block and footer in the following order.
+ // 1. [meta block: filter]
+ // 2. [meta block: index]
+ // 3. [meta block: compression dictionary]
+ // 4. [meta block: range deletion tombstone]
+ // 5. [meta block: properties]
+ // 6. [metaindex block]
+ // 7. Footer
+ BlockHandle metaindex_block_handle, index_block_handle;
+ MetaIndexBuilder meta_index_builder;
+ WriteFilterBlock(&meta_index_builder);
+ WriteIndexBlock(&meta_index_builder, &index_block_handle);
+ WriteCompressionDictBlock(&meta_index_builder);
+ WriteRangeDelBlock(&meta_index_builder);
+ WritePropertiesBlock(&meta_index_builder);
+ if (ok()) {
+ // flush the meta index block
+ WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
+ &metaindex_block_handle);
+ }
+ if (ok()) {
+ WriteFooter(metaindex_block_handle, index_block_handle);
+ }
+ if (r->file != nullptr) {
+ file_checksum_ = r->file->GetFileChecksum();
+ }
+ r->state = Rep::State::kClosed;
+ return r->status;
+}
+
+void BlockBasedTableBuilder::Abandon() {
+ assert(rep_->state != Rep::State::kClosed);
+ rep_->state = Rep::State::kClosed;
+}
+
+uint64_t BlockBasedTableBuilder::NumEntries() const {
+ return rep_->props.num_entries;
+}
+
+uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; }
+
+bool BlockBasedTableBuilder::NeedCompact() const {
+ for (const auto& collector : rep_->table_properties_collectors) {
+ if (collector->NeedCompact()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+TableProperties BlockBasedTableBuilder::GetTableProperties() const {
+ TableProperties ret = rep_->props;
+ for (const auto& collector : rep_->table_properties_collectors) {
+ for (const auto& prop : collector->GetReadableProperties()) {
+ ret.readable_properties.insert(prop);
+ }
+ collector->Finish(&ret.user_collected_properties);
+ }
+ return ret;
+}
+
+const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const {
+ if (rep_->file != nullptr) {
+ return rep_->file->GetFileChecksumFuncName();
+ } else {
+ return kUnknownFileChecksumFuncName.c_str();
+ }
+}
+
+const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
+const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
+const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
+ "partitionedfilter.";
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_builder.h b/src/rocksdb/table/block_based/block_based_table_builder.h
new file mode 100644
index 000000000..97c9bc65a
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_builder.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+struct BlockBasedTableOptions;
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+
+class BlockBasedTableBuilder : public TableBuilder {
+ public:
+ // Create a builder that will store the contents of the table it is
+ // building in *file. Does not close the file. It is up to the
+ // caller to close the file after calling Finish().
+ BlockBasedTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, WritableFileWriter* file,
+ const CompressionType compression_type,
+ const uint64_t sample_for_compression,
+ const CompressionOptions& compression_opts, const bool skip_filters,
+ const std::string& column_family_name, const int level_at_creation,
+ const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
+ const uint64_t target_file_size = 0,
+ const uint64_t file_creation_time = 0);
+
+ // No copying allowed
+ BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
+ BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete;
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~BlockBasedTableBuilder();
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override;
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override;
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish() override;
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon() override;
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const override;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const override;
+
+ bool NeedCompact() const override;
+
+ // Get table properties
+ TableProperties GetTableProperties() const override;
+
+ // Get file checksum
+ const std::string& GetFileChecksum() const override { return file_checksum_; }
+
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override;
+
+ private:
+ bool ok() const { return status().ok(); }
+
+ // Transition state from buffered to unbuffered. See `Rep::State` API comment
+ // for details of the states.
+ // REQUIRES: `rep_->state == kBuffered`
+ void EnterUnbuffered();
+
+ // Call block's Finish() method
+ // and then write the compressed block contents to file.
+ void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block);
+
+ // Compress and write block content to the file.
+ void WriteBlock(const Slice& block_contents, BlockHandle* handle,
+ bool is_data_block);
+ // Directly write data to the file.
+ void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle,
+ bool is_data_block = false);
+ Status InsertBlockInCache(const Slice& block_contents,
+ const CompressionType type,
+ const BlockHandle* handle);
+
+ void WriteFilterBlock(MetaIndexBuilder* meta_index_builder);
+ void WriteIndexBlock(MetaIndexBuilder* meta_index_builder,
+ BlockHandle* index_block_handle);
+ void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder);
+ void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder);
+ void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder);
+ void WriteFooter(BlockHandle& metaindex_block_handle,
+ BlockHandle& index_block_handle);
+
+ struct Rep;
+ class BlockBasedTablePropertiesCollectorFactory;
+ class BlockBasedTablePropertiesCollector;
+ Rep* rep_;
+
+ // Advanced operation: flush any buffered key/value pairs to file.
+ // Can be used to ensure that two adjacent entries never live in
+ // the same data block. Most clients should not need to use this method.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Flush();
+
+ // Some compression libraries fail when the raw size is bigger than int. If
+ // the uncompressed size is bigger than kCompressionSizeLimit, don't
+ // compress it.
+ const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
+
+ // Store file checksum. If checksum is disabled, its value is "0".
+ std::string file_checksum_ = kUnknownFileChecksum;
+};
+
+Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
+ CompressionType* type, uint32_t format_version,
+ bool do_sample, std::string* compressed_output,
+ std::string* sampled_output_fast,
+ std::string* sampled_output_slow);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc
new file mode 100644
index 000000000..70a6f38d5
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_factory.cc
@@ -0,0 +1,649 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdint.h>
+#include <cinttypes>
+
+#include <memory>
+#include <string>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+ MutexLock l(&mutex_);
+ if (num_records_ < kNumTracked) {
+ num_records_++;
+ }
+ records_[next_++] = len;
+ if (next_ == kNumTracked) {
+ next_ = 0;
+ }
+}
+
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+ std::vector<size_t> sorted;
+ {
+ MutexLock l(&mutex_);
+
+ if (num_records_ == 0) {
+ return 0;
+ }
+ sorted.assign(records_, records_ + num_records_);
+ }
+
+ // Of the historic sizes, we find the maximum one that satisfies the
+ // condition that if we prefetch that much, less than 1/8 of it will be
+ // wasted.
+ std::sort(sorted.begin(), sorted.end());
+
+ // Assuming we have 5 data points, and after sorting it looks like this:
+ //
+ // +---+
+ // +---+ | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // +---+ | | | |
+ // | | | | | |
+ // +---+ | | | | | |
+ // | | | | | | | |
+ // +---+ | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // +---+ +---+ +---+ +---+ +---+
+ //
+ // and we use each of the values as a candidate, estimating how much is
+ // wasted compared to what is read. For example, when we use the 3rd record
+ // as the candidate, this area is what we read:
+ // +---+
+ // +---+ | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // *** *** *** ***+ *** *** *** *** **
+ // * | | | | | |
+ // +---+ | | | | | *
+ // * | | | | | | | |
+ // +---+ | | | | | | | *
+ // * | | | | X | | | | |
+ // | | | | | | | | | *
+ // * | | | | | | | | |
+ // | | | | | | | | | *
+ // * | | | | | | | | |
+ // *** *** ***-*** ***--*** ***--*** +****
+ // which is (size of the record) X (number of records).
+ //
+ // While wasted is this area:
+ // +---+
+ // +---+ | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // | | | |
+ // *** *** *** ****---+ | | | |
+ // * * | | | | |
+ // * *-*** *** | | | | |
+ // * * | | | | | | |
+ // *--** *** | | | | | | |
+ // | | | | | X | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // +---+ +---+ +---+ +---+ +---+
+ //
+ // Which can be calculated iteratively.
+ // The difference in waste between using the 4th and the 3rd record will
+ // be the following area:
+ // +---+
+ // +--+ +-+ ++ +-+ +-+ +---+ | |
+ // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+ // xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+ // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+ // | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+ // +-+ +-+ +-+ ++ +---+ +--+ | | |
+ // | | | | | | |
+ // +---+ ++ | | | | | |
+ // | | | | | | X | | |
+ // +---+ ++ | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // | | | | | | | | | |
+ // +---+ +---+ +---+ +---+ +---+
+ //
+ // which is the size difference between the 4th and the 3rd record,
+ // times 3, the number of records before the 4th.
+ // Here we assume that all data within the prefetch range will be useful. In
+ // reality, that may not be the case when a partial block is inside the
+ // range, or there is data in the middle that is not read. We ignore those
+ // cases for simplicity.
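+ // Worked example with hypothetical sizes: sorted = {2, 3, 3, 8, 10}.
+ // Candidate 3 reads 3 * 5 = 15 with cumulative waste 1 (<= 15 / 8), so it
+ // qualifies. Candidate 8 reads 8 * 5 = 40 with cumulative waste 16
+ // (> 40 / 8), and candidate 10 reads 50 with waste 24 (> 50 / 8), so
+ // neither qualifies; the suggested prefetch size is 3.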
+ assert(!sorted.empty());
+ size_t prev_size = sorted[0];
+ size_t max_qualified_size = sorted[0];
+ size_t wasted = 0;
+ for (size_t i = 1; i < sorted.size(); i++) {
+ size_t read = sorted[i] * sorted.size();
+ wasted += (sorted[i] - prev_size) * i;
+ if (wasted <= read / 8) {
+ max_qualified_size = sorted[i];
+ }
+ prev_size = sorted[i];
+ }
+ const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB
+ return std::min(kMaxPrefetchSize, max_qualified_size);
+}
+
+// TODO(myabandeh): We should return an error instead of silently changing the
+// options
+BlockBasedTableFactory::BlockBasedTableFactory(
+ const BlockBasedTableOptions& _table_options)
+ : table_options_(_table_options) {
+ if (table_options_.flush_block_policy_factory == nullptr) {
+ table_options_.flush_block_policy_factory.reset(
+ new FlushBlockBySizePolicyFactory());
+ }
+ if (table_options_.no_block_cache) {
+ table_options_.block_cache.reset();
+ } else if (table_options_.block_cache == nullptr) {
+ LRUCacheOptions co;
+ co.capacity = 8 << 20;
+ // It makes little sense to pay the overhead of mid-point insertion while
+ // the cache capacity is only 8MB.
+ co.high_pri_pool_ratio = 0.0;
+ table_options_.block_cache = NewLRUCache(co);
+ }
+ if (table_options_.block_size_deviation < 0 ||
+ table_options_.block_size_deviation > 100) {
+ table_options_.block_size_deviation = 0;
+ }
+ if (table_options_.block_restart_interval < 1) {
+ table_options_.block_restart_interval = 1;
+ }
+ if (table_options_.index_block_restart_interval < 1) {
+ table_options_.index_block_restart_interval = 1;
+ }
+ if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+ table_options_.index_block_restart_interval != 1) {
+ // Currently kHashSearch is incompatible with index_block_restart_interval > 1
+ table_options_.index_block_restart_interval = 1;
+ }
+ if (table_options_.partition_filters &&
+ table_options_.index_type !=
+ BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ // We do not support partitioned filters without partitioning indexes
+ table_options_.partition_filters = false;
+ }
+}
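+
+// Note: after the sanitization above, a factory constructed from default
+// BlockBasedTableOptions ends up with an 8MB LRU block cache and a
+// FlushBlockBySizePolicyFactory.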
+
+Status BlockBasedTableFactory::NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache) const {
+ return BlockBasedTable::Open(
+ table_reader_options.ioptions, table_reader_options.env_options,
+ table_options_, table_reader_options.internal_comparator, std::move(file),
+ file_size, table_reader, table_reader_options.prefix_extractor,
+ prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
+ table_reader_options.level, table_reader_options.immortal,
+ table_reader_options.largest_seqno, &tail_prefetch_stats_,
+ table_reader_options.block_cache_tracer);
+}
+
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
+ WritableFileWriter* file) const {
+ auto table_builder = new BlockBasedTableBuilder(
+ table_builder_options.ioptions, table_builder_options.moptions,
+ table_options_, table_builder_options.internal_comparator,
+ table_builder_options.int_tbl_prop_collector_factories, column_family_id,
+ file, table_builder_options.compression_type,
+ table_builder_options.sample_for_compression,
+ table_builder_options.compression_opts,
+ table_builder_options.skip_filters,
+ table_builder_options.column_family_name, table_builder_options.level,
+ table_builder_options.creation_time,
+ table_builder_options.oldest_key_time,
+ table_builder_options.target_file_size,
+ table_builder_options.file_creation_time);
+
+ return table_builder;
+}
+
+Status BlockBasedTableFactory::SanitizeOptions(
+ const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+ if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+ cf_opts.prefix_extractor == nullptr) {
+ return Status::InvalidArgument(
+ "Hash index is specified for block-based "
+ "table, but prefix_extractor is not given");
+ }
+ if (table_options_.cache_index_and_filter_blocks &&
+ table_options_.no_block_cache) {
+ return Status::InvalidArgument(
+ "Enable cache_index_and_filter_blocks, "
+ ", but block cache is disabled");
+ }
+ if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
+ table_options_.no_block_cache) {
+ return Status::InvalidArgument(
+ "Enable pin_l0_filter_and_index_blocks_in_cache, "
+ ", but block cache is disabled");
+ }
+ if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
+ return Status::InvalidArgument(
+ "Unsupported BlockBasedTable format_version. Please check "
+ "include/rocksdb/table.h for more info");
+ }
+ if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
+ return Status::InvalidArgument(
+ "Enable block_align, but compression "
+ "enabled");
+ }
+ if (table_options_.block_align &&
+ (table_options_.block_size & (table_options_.block_size - 1))) {
+ return Status::InvalidArgument(
+ "Block alignment requested but block size is not a power of 2");
+ }
+ if (table_options_.block_size > port::kMaxUint32) {
+ return Status::InvalidArgument(
+ "block size exceeds maximum number (4GiB) allowed");
+ }
+ if (table_options_.data_block_index_type ==
+ BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+ table_options_.data_block_hash_table_util_ratio <= 0) {
+ return Status::InvalidArgument(
+ "data_block_hash_table_util_ratio should be greater than 0 when "
+ "data_block_index_type is set to kDataBlockBinaryAndHash");
+ }
+ if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
+ // TODO(myabandeh): support it
+ return Status::InvalidArgument(
+ "max_successive_merges larger than 0 is currently inconsistent with "
+ "unordered_write");
+ }
+ return Status::OK();
+}
+
+std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
+ std::string ret;
+ ret.reserve(20000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n",
+ table_options_.flush_block_policy_factory->Name(),
+ static_cast<void*>(table_options_.flush_block_policy_factory.get()));
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n",
+ table_options_.cache_index_and_filter_blocks);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize,
+ " cache_index_and_filter_blocks_with_high_priority: %d\n",
+ table_options_.cache_index_and_filter_blocks_with_high_priority);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize,
+ " pin_l0_filter_and_index_blocks_in_cache: %d\n",
+ table_options_.pin_l0_filter_and_index_blocks_in_cache);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
+ table_options_.pin_top_level_index_and_filter);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " index_type: %d\n",
+ table_options_.index_type);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
+ table_options_.data_block_index_type);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " index_shortening: %d\n",
+ static_cast<int>(table_options_.index_shortening));
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
+ table_options_.data_block_hash_table_util_ratio);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n",
+ table_options_.hash_index_allow_collision);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " no_block_cache: %d\n",
+ table_options_.no_block_cache);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " block_cache: %p\n",
+ static_cast<void*>(table_options_.block_cache.get()));
+ ret.append(buffer);
+ if (table_options_.block_cache) {
+ const char* block_cache_name = table_options_.block_cache->Name();
+ if (block_cache_name != nullptr) {
+ snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
+ block_cache_name);
+ ret.append(buffer);
+ }
+ ret.append(" block_cache_options:\n");
+ ret.append(table_options_.block_cache->GetPrintableOptions());
+ }
+ snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n",
+ static_cast<void*>(table_options_.block_cache_compressed.get()));
+ ret.append(buffer);
+ if (table_options_.block_cache_compressed) {
+ const char* block_cache_compressed_name =
+ table_options_.block_cache_compressed->Name();
+ if (block_cache_compressed_name != nullptr) {
+ snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
+ block_cache_compressed_name);
+ ret.append(buffer);
+ }
+ ret.append(" block_cache_compressed_options:\n");
+ ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
+ }
+ snprintf(buffer, kBufferSize, " persistent_cache: %p\n",
+ static_cast<void*>(table_options_.persistent_cache.get()));
+ ret.append(buffer);
+ if (table_options_.persistent_cache) {
+ snprintf(buffer, kBufferSize, " persistent_cache_options:\n");
+ ret.append(buffer);
+ ret.append(table_options_.persistent_cache->GetPrintableOptions());
+ }
+ snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n",
+ table_options_.block_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " block_size_deviation: %d\n",
+ table_options_.block_size_deviation);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " block_restart_interval: %d\n",
+ table_options_.block_restart_interval);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n",
+ table_options_.index_block_restart_interval);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n",
+ table_options_.metadata_block_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " partition_filters: %d\n",
+ table_options_.partition_filters);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n",
+ table_options_.use_delta_encoding);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " filter_policy: %s\n",
+ table_options_.filter_policy == nullptr
+ ? "nullptr"
+ : table_options_.filter_policy->Name());
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
+ table_options_.whole_key_filtering);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " verify_compression: %d\n",
+ table_options_.verify_compression);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n",
+ table_options_.read_amp_bytes_per_bit);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " format_version: %d\n",
+ table_options_.format_version);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " enable_index_compression: %d\n",
+ table_options_.enable_index_compression);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " block_align: %d\n",
+ table_options_.block_align);
+ ret.append(buffer);
+ return ret;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+bool SerializeSingleBlockBasedTableOption(
+ std::string* opt_string, const BlockBasedTableOptions& bbt_options,
+ const std::string& name, const std::string& delimiter) {
+ auto iter = block_based_table_type_info.find(name);
+ if (iter == block_based_table_type_info.end()) {
+ return false;
+ }
+ auto& opt_info = iter->second;
+ const char* opt_address =
+ reinterpret_cast<const char*>(&bbt_options) + opt_info.offset;
+ std::string value;
+ bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value);
+ if (result) {
+ *opt_string = name + "=" + value + delimiter;
+ }
+ return result;
+}
+} // namespace
+
+Status BlockBasedTableFactory::GetOptionString(
+ std::string* opt_string, const std::string& delimiter) const {
+ assert(opt_string);
+ opt_string->clear();
+ for (auto iter = block_based_table_type_info.begin();
+ iter != block_based_table_type_info.end(); ++iter) {
+ if (iter->second.verification == OptionVerificationType::kDeprecated) {
+ // If the option is no longer used in rocksdb and marked as deprecated,
+ // we skip it in the serialization.
+ continue;
+ }
+ std::string single_output;
+ bool result = SerializeSingleBlockBasedTableOption(
+ &single_output, table_options_, iter->first, delimiter);
+ assert(result);
+ if (result) {
+ opt_string->append(single_output);
+ }
+ }
+ return Status::OK();
+}
+#else
+Status BlockBasedTableFactory::GetOptionString(
+ std::string* /*opt_string*/, const std::string& /*delimiter*/) const {
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
+
+const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const {
+ return table_options_;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+std::string ParseBlockBasedTableOption(const std::string& name,
+ const std::string& org_value,
+ BlockBasedTableOptions* new_options,
+ bool input_strings_escaped = false,
+ bool ignore_unknown_options = false) {
+ const std::string& value =
+ input_strings_escaped ? UnescapeOptionString(org_value) : org_value;
+ if (!input_strings_escaped) {
+ // if the input string is not escaped, it means this function is
+ // invoked from SetOptions, which takes the old format.
+ if (name == "block_cache" || name == "block_cache_compressed") {
+ // cache options can be specified in the following format
+ // "block_cache={capacity=1M;num_shard_bits=4;
+ // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}"
+ // To support backward compatibility, the following format
+ // is also supported.
+ // "block_cache=1M"
+ std::shared_ptr<Cache> cache;
+ // block_cache is specified in format block_cache=<cache_size>.
+ if (value.find('=') == std::string::npos) {
+ cache = NewLRUCache(ParseSizeT(value));
+ } else {
+ LRUCacheOptions cache_opts;
+ if (!ParseOptionHelper(reinterpret_cast<char*>(&cache_opts),
+ OptionType::kLRUCacheOptions, value)) {
+ return "Invalid cache options";
+ }
+ cache = NewLRUCache(cache_opts);
+ }
+
+ if (name == "block_cache") {
+ new_options->block_cache = cache;
+ } else {
+ new_options->block_cache_compressed = cache;
+ }
+ return "";
+ } else if (name == "filter_policy") {
+ // Expect the following format
+ // bloomfilter:int:bool
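+ // e.g. "filter_policy=bloomfilter:10:false" yields
+ // NewBloomFilterPolicy(10 /* bits_per_key */,
+ // false /* use_block_based_builder */).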
+ const std::string kName = "bloomfilter:";
+ if (value.compare(0, kName.size(), kName) != 0) {
+ return "Invalid filter policy name";
+ }
+ size_t pos = value.find(':', kName.size());
+ if (pos == std::string::npos) {
+ return "Invalid filter policy config, missing bits_per_key";
+ }
+ double bits_per_key =
+ ParseDouble(trim(value.substr(kName.size(), pos - kName.size())));
+ bool use_block_based_builder =
+ ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1)));
+ new_options->filter_policy.reset(
+ NewBloomFilterPolicy(bits_per_key, use_block_based_builder));
+ return "";
+ }
+ }
+ const auto iter = block_based_table_type_info.find(name);
+ if (iter == block_based_table_type_info.end()) {
+ if (ignore_unknown_options) {
+ return "";
+ } else {
+ return "Unrecognized option";
+ }
+ }
+ const auto& opt_info = iter->second;
+ if (opt_info.verification != OptionVerificationType::kDeprecated &&
+ !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset,
+ opt_info.type, value)) {
+ return "Invalid value";
+ }
+ return "";
+}
+} // namespace
+
+Status GetBlockBasedTableOptionsFromString(
+ const BlockBasedTableOptions& table_options, const std::string& opts_str,
+ BlockBasedTableOptions* new_table_options) {
+ std::unordered_map<std::string, std::string> opts_map;
+ Status s = StringToMap(opts_str, &opts_map);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return GetBlockBasedTableOptionsFromMap(table_options, opts_map,
+ new_table_options);
+}
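+
+// Illustrative sketch (values are examples only): the options string is a
+// ';'-delimited list of key=value entries, using the formats documented in
+// ParseBlockBasedTableOption() above.
+//
+//   BlockBasedTableOptions base;
+//   BlockBasedTableOptions parsed;
+//   Status s = GetBlockBasedTableOptionsFromString(
+//       base,
+//       "block_size=4096;block_cache=1M;filter_policy=bloomfilter:10:false",
+//       &parsed);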
+
+Status GetBlockBasedTableOptionsFromMap(
+ const BlockBasedTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
+ bool ignore_unknown_options) {
+ assert(new_table_options);
+ *new_table_options = table_options;
+ for (const auto& o : opts_map) {
+ auto error_message = ParseBlockBasedTableOption(
+ o.first, o.second, new_table_options, input_strings_escaped,
+ ignore_unknown_options);
+ if (error_message != "") {
+ const auto iter = block_based_table_type_info.find(o.first);
+ if (iter == block_based_table_type_info.end() ||
+ !input_strings_escaped || // !input_strings_escaped indicates
+ // the old API, where everything is
+ // parsable.
+ (iter->second.verification != OptionVerificationType::kByName &&
+ iter->second.verification !=
+ OptionVerificationType::kByNameAllowNull &&
+ iter->second.verification !=
+ OptionVerificationType::kByNameAllowFromNull &&
+ iter->second.verification != OptionVerificationType::kDeprecated)) {
+        // Restore "new_table_options" to the default "table_options".
+ *new_table_options = table_options;
+ return Status::InvalidArgument("Can't parse BlockBasedTableOptions:",
+ o.first + " " + error_message);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+Status VerifyBlockBasedTableFactory(
+ const BlockBasedTableFactory* base_tf,
+ const BlockBasedTableFactory* file_tf,
+ OptionsSanityCheckLevel sanity_check_level) {
+ if ((base_tf != nullptr) != (file_tf != nullptr) &&
+ sanity_check_level > kSanityLevelNone) {
+ return Status::Corruption(
+ "[RocksDBOptionsParser]: Inconsistent TableFactory class type");
+ }
+ if (base_tf == nullptr) {
+ return Status::OK();
+ }
+ assert(file_tf != nullptr);
+
+ const auto& base_opt = base_tf->table_options();
+ const auto& file_opt = file_tf->table_options();
+
+ for (auto& pair : block_based_table_type_info) {
+ if (pair.second.verification == OptionVerificationType::kDeprecated) {
+      // We skip checking deprecated variables, as they might contain random
+      // values because they might never have been initialized.
+ continue;
+ }
+ if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) {
+ if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt),
+ reinterpret_cast<const char*>(&file_opt),
+ pair.second, pair.first, nullptr)) {
+ return Status::Corruption(
+ "[RocksDBOptionsParser]: "
+ "failed the verification on BlockBasedTableOptions::",
+ pair.first);
+ }
+ }
+ }
+ return Status::OK();
+}
+#endif // !ROCKSDB_LITE
+
+TableFactory* NewBlockBasedTableFactory(
+ const BlockBasedTableOptions& _table_options) {
+ return new BlockBasedTableFactory(_table_options);
+}
+
+const std::string BlockBasedTableFactory::kName = "BlockBasedTable";
+const std::string BlockBasedTablePropertyNames::kIndexType =
+ "rocksdb.block.based.table.index.type";
+const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
+ "rocksdb.block.based.table.whole.key.filtering";
+const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
+ "rocksdb.block.based.table.prefix.filtering";
+const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
+const std::string kHashIndexPrefixesMetadataBlock =
+ "rocksdb.hashindex.metadata";
+const std::string kPropTrue = "1";
+const std::string kPropFalse = "0";
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.h b/src/rocksdb/table/block_based/block_based_table_factory.h
new file mode 100644
index 000000000..7c8633c07
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_factory.h
@@ -0,0 +1,195 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class BlockBasedTableBuilder;
+
+// A class used to track the actual bytes read from the tail in recent SST
+// file opens, and to provide a prefetch-size suggestion for the following
+// opens.
+class TailPrefetchStats {
+ public:
+ void RecordEffectiveSize(size_t len);
+  // Returns 0 when there is no information to determine a size.
+ size_t GetSuggestedPrefetchSize();
+
+ private:
+ const static size_t kNumTracked = 32;
+ size_t records_[kNumTracked];
+ port::Mutex mutex_;
+ size_t next_ = 0;
+ size_t num_records_ = 0;
+};
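+
+// Only the most recent kNumTracked sizes are kept (records_ acts as a ring
+// buffer via next_), so the suggestion reflects recent opens. Illustrative
+// sketch:
+//
+//   TailPrefetchStats stats;
+//   stats.RecordEffectiveSize(64 * 1024);
+//   size_t n = stats.GetSuggestedPrefetchSize();  // 0 only with no history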
+
+class BlockBasedTableFactory : public TableFactory {
+ public:
+ explicit BlockBasedTableFactory(
+ const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+ ~BlockBasedTableFactory() {}
+
+ const char* Name() const override { return kName.c_str(); }
+
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache = true) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ uint32_t column_family_id, WritableFileWriter* file) const override;
+
+ // Sanitizes the specified DB Options.
+ Status SanitizeOptions(const DBOptions& db_opts,
+ const ColumnFamilyOptions& cf_opts) const override;
+
+ std::string GetPrintableTableOptions() const override;
+
+ Status GetOptionString(std::string* opt_string,
+ const std::string& delimiter) const override;
+
+ const BlockBasedTableOptions& table_options() const;
+
+ void* GetOptions() override { return &table_options_; }
+
+ bool IsDeleteRangeSupported() const override { return true; }
+
+ static const std::string kName;
+
+ private:
+ BlockBasedTableOptions table_options_;
+ mutable TailPrefetchStats tail_prefetch_stats_;
+};
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+extern const std::string kPropTrue;
+extern const std::string kPropFalse;
+
+#ifndef ROCKSDB_LITE
+extern Status VerifyBlockBasedTableFactory(
+ const BlockBasedTableFactory* base_tf,
+ const BlockBasedTableFactory* file_tf,
+ OptionsSanityCheckLevel sanity_check_level);
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ block_based_table_type_info = {
+ /* currently not supported
+ std::shared_ptr<Cache> block_cache = nullptr;
+ std::shared_ptr<Cache> block_cache_compressed = nullptr;
+ */
+ {"flush_block_policy_factory",
+ {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory),
+ OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName,
+ false, 0}},
+ {"cache_index_and_filter_blocks",
+ {offsetof(struct BlockBasedTableOptions,
+ cache_index_and_filter_blocks),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"cache_index_and_filter_blocks_with_high_priority",
+ {offsetof(struct BlockBasedTableOptions,
+ cache_index_and_filter_blocks_with_high_priority),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"pin_l0_filter_and_index_blocks_in_cache",
+ {offsetof(struct BlockBasedTableOptions,
+ pin_l0_filter_and_index_blocks_in_cache),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"index_type",
+ {offsetof(struct BlockBasedTableOptions, index_type),
+ OptionType::kBlockBasedTableIndexType,
+ OptionVerificationType::kNormal, false, 0}},
+ {"hash_index_allow_collision",
+ {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"data_block_index_type",
+ {offsetof(struct BlockBasedTableOptions, data_block_index_type),
+ OptionType::kBlockBasedTableDataBlockIndexType,
+ OptionVerificationType::kNormal, false, 0}},
+ {"index_shortening",
+ {offsetof(struct BlockBasedTableOptions, index_shortening),
+ OptionType::kBlockBasedTableIndexShorteningMode,
+ OptionVerificationType::kNormal, false, 0}},
+ {"data_block_hash_table_util_ratio",
+ {offsetof(struct BlockBasedTableOptions,
+ data_block_hash_table_util_ratio),
+ OptionType::kDouble, OptionVerificationType::kNormal, false, 0}},
+ {"checksum",
+ {offsetof(struct BlockBasedTableOptions, checksum),
+ OptionType::kChecksumType, OptionVerificationType::kNormal, false,
+ 0}},
+ {"no_block_cache",
+ {offsetof(struct BlockBasedTableOptions, no_block_cache),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"block_size",
+ {offsetof(struct BlockBasedTableOptions, block_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
+ {"block_size_deviation",
+ {offsetof(struct BlockBasedTableOptions, block_size_deviation),
+ OptionType::kInt, OptionVerificationType::kNormal, false, 0}},
+ {"block_restart_interval",
+ {offsetof(struct BlockBasedTableOptions, block_restart_interval),
+ OptionType::kInt, OptionVerificationType::kNormal, false, 0}},
+ {"index_block_restart_interval",
+ {offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
+ OptionType::kInt, OptionVerificationType::kNormal, false, 0}},
+ {"index_per_partition",
+ {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false,
+ 0}},
+ {"metadata_block_size",
+ {offsetof(struct BlockBasedTableOptions, metadata_block_size),
+ OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}},
+ {"partition_filters",
+ {offsetof(struct BlockBasedTableOptions, partition_filters),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"filter_policy",
+ {offsetof(struct BlockBasedTableOptions, filter_policy),
+ OptionType::kFilterPolicy, OptionVerificationType::kByName, false,
+ 0}},
+ {"whole_key_filtering",
+ {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"skip_table_builder_flush",
+ {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false,
+ 0}},
+ {"format_version",
+ {offsetof(struct BlockBasedTableOptions, format_version),
+ OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}},
+ {"verify_compression",
+ {offsetof(struct BlockBasedTableOptions, verify_compression),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"read_amp_bytes_per_bit",
+ {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
+ OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
+ {"enable_index_compression",
+ {offsetof(struct BlockBasedTableOptions, enable_index_compression),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"block_align",
+ {offsetof(struct BlockBasedTableOptions, block_align),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+ {"pin_top_level_index_and_filter",
+ {offsetof(struct BlockBasedTableOptions,
+ pin_top_level_index_and_filter),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader.cc b/src/rocksdb/table/block_based/block_based_table_reader.cc
new file mode 100644
index 000000000..9b37b431f
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader.cc
@@ -0,0 +1,4531 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_based_table_reader.h"
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+
+#include "file/file_prefetch_buffer.h"
+#include "file/random_access_file_reader.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/block_fetcher.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/persistent_cache_helper.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/two_level_iterator.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+typedef BlockBasedTable::IndexReader IndexReader;
+
+// Based on experiments, a 256 KB readahead size provides the best performance
+// for auto readahead. Experiment data is in PR #3282.
+const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
+
+BlockBasedTable::~BlockBasedTable() {
+ delete rep_;
+}
+
+std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);
+
+template <typename TBlocklike>
+class BlocklikeTraits;
+
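+// Each BlocklikeTraits specialization below adapts one cacheable block-like
+// type to the shared read path: Create() builds the in-memory object from raw
+// BlockContents, while GetNumRestarts() is only meaningful for Block; the
+// other specializations simply return 0.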
+template <>
+class BlocklikeTraits<BlockContents> {
+ public:
+ static BlockContents* Create(BlockContents&& contents,
+ SequenceNumber /* global_seqno */,
+ size_t /* read_amp_bytes_per_bit */,
+ Statistics* /* statistics */,
+ bool /* using_zstd */,
+ const FilterPolicy* /* filter_policy */) {
+ return new BlockContents(std::move(contents));
+ }
+
+ static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
+ return 0;
+ }
+};
+
+template <>
+class BlocklikeTraits<ParsedFullFilterBlock> {
+ public:
+ static ParsedFullFilterBlock* Create(BlockContents&& contents,
+ SequenceNumber /* global_seqno */,
+ size_t /* read_amp_bytes_per_bit */,
+ Statistics* /* statistics */,
+ bool /* using_zstd */,
+ const FilterPolicy* filter_policy) {
+ return new ParsedFullFilterBlock(filter_policy, std::move(contents));
+ }
+
+ static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
+ return 0;
+ }
+};
+
+template <>
+class BlocklikeTraits<Block> {
+ public:
+ static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
+ size_t read_amp_bytes_per_bit, Statistics* statistics,
+ bool /* using_zstd */,
+ const FilterPolicy* /* filter_policy */) {
+ return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
+ statistics);
+ }
+
+ static uint32_t GetNumRestarts(const Block& block) {
+ return block.NumRestarts();
+ }
+};
+
+template <>
+class BlocklikeTraits<UncompressionDict> {
+ public:
+ static UncompressionDict* Create(BlockContents&& contents,
+ SequenceNumber /* global_seqno */,
+ size_t /* read_amp_bytes_per_bit */,
+ Statistics* /* statistics */,
+ bool using_zstd,
+ const FilterPolicy* /* filter_policy */) {
+ return new UncompressionDict(contents.data, std::move(contents.allocation),
+ using_zstd);
+ }
+
+ static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
+ return 0;
+ }
+};
+
+namespace {
+// Read the block identified by "handle" from "file".
+// The only relevant option is options.verify_checksums for now.
+// On failure, return non-OK.
+// On success, fill *result and return OK; the caller owns *result.
+// @param uncompression_dict Data for presetting the compression library's
+// dictionary.
+template <typename TBlocklike>
+Status ReadBlockFromFile(
+ RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+ const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
+ std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
+ bool do_uncompress, bool maybe_compressed, BlockType block_type,
+ const UncompressionDict& uncompression_dict,
+ const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
+ size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
+ bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
+ assert(result);
+
+ BlockContents contents;
+ BlockFetcher block_fetcher(
+ file, prefetch_buffer, footer, options, handle, &contents, ioptions,
+ do_uncompress, maybe_compressed, block_type, uncompression_dict,
+ cache_options, memory_allocator, nullptr, for_compaction);
+ Status s = block_fetcher.ReadBlockContents();
+ if (s.ok()) {
+ result->reset(BlocklikeTraits<TBlocklike>::Create(
+ std::move(contents), global_seqno, read_amp_bytes_per_bit,
+ ioptions.statistics, using_zstd, filter_policy));
+ }
+
+ return s;
+}
+
+inline MemoryAllocator* GetMemoryAllocator(
+ const BlockBasedTableOptions& table_options) {
+ return table_options.block_cache.get()
+ ? table_options.block_cache->memory_allocator()
+ : nullptr;
+}
+
+inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
+ const BlockBasedTableOptions& table_options) {
+ return table_options.block_cache_compressed.get()
+ ? table_options.block_cache_compressed->memory_allocator()
+ : nullptr;
+}
+
+// Delete the entry residing in the cache.
+template <class Entry>
+void DeleteCachedEntry(const Slice& /*key*/, void* value) {
+ auto entry = reinterpret_cast<Entry*>(value);
+ delete entry;
+}
+
+// Release the cached entry and decrement its ref count; force-erase the
+// entry from the cache if this drops the last reference.
+void ForceReleaseCachedEntry(void* arg, void* h) {
+ Cache* cache = reinterpret_cast<Cache*>(arg);
+ Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+ cache->Release(handle, true /* force_erase */);
+}
+
+// Release the cached entry and decrement its ref count.
+// Do not force erase
+void ReleaseCachedEntry(void* arg, void* h) {
+ Cache* cache = reinterpret_cast<Cache*>(arg);
+ Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+ cache->Release(handle, false /* force_erase */);
+}
+
+// For a hash-based index, return true if prefix_extractor and
+// prefix_extractor_block mismatch, false otherwise. This flag will be used
+// as total_order_seek via NewIndexIterator.
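+// Decision sketch:
+//   - no prefix_extractor / no properties / empty recorded name -> true
+//   - recorded name differs from prefix_extractor->Name()       -> true
+//   - otherwise                                                 -> false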
+bool PrefixExtractorChanged(const TableProperties* table_properties,
+ const SliceTransform* prefix_extractor) {
+ // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
+  // Turn off the hash index if prefix_extractor is not set; if
+  // prefix_extractor is set but prefix_extractor_block is not set, also
+  // disable the hash index.
+ if (prefix_extractor == nullptr || table_properties == nullptr ||
+ table_properties->prefix_extractor_name.empty()) {
+ return true;
+ }
+
+ // prefix_extractor and prefix_extractor_block are both non-empty
+ if (table_properties->prefix_extractor_name.compare(
+ prefix_extractor->Name()) != 0) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
+ CacheAllocationPtr heap_buf;
+ heap_buf = AllocateBlock(buf.size(), allocator);
+ memcpy(heap_buf.get(), buf.data(), buf.size());
+ return heap_buf;
+}
+
+} // namespace
+
+// Encapsulates common functionality for the various index reader
+// implementations. Provides access to the index block regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
+ public:
+ IndexReaderCommon(const BlockBasedTable* t,
+ CachableEntry<Block>&& index_block)
+ : table_(t), index_block_(std::move(index_block)) {
+ assert(table_ != nullptr);
+ }
+
+ protected:
+ static Status ReadIndexBlock(const BlockBasedTable* table,
+ FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<Block>* index_block);
+
+ const BlockBasedTable* table() const { return table_; }
+
+ const InternalKeyComparator* internal_comparator() const {
+ assert(table_ != nullptr);
+ assert(table_->get_rep() != nullptr);
+
+ return &table_->get_rep()->internal_comparator;
+ }
+
+ bool index_has_first_key() const {
+ assert(table_ != nullptr);
+ assert(table_->get_rep() != nullptr);
+ return table_->get_rep()->index_has_first_key;
+ }
+
+ bool index_key_includes_seq() const {
+ assert(table_ != nullptr);
+ assert(table_->get_rep() != nullptr);
+ return table_->get_rep()->index_key_includes_seq;
+ }
+
+ bool index_value_is_full() const {
+ assert(table_ != nullptr);
+ assert(table_->get_rep() != nullptr);
+ return table_->get_rep()->index_value_is_full;
+ }
+
+ bool cache_index_blocks() const {
+ assert(table_ != nullptr);
+ assert(table_->get_rep() != nullptr);
+ return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+ }
+
+ Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<Block>* index_block) const;
+
+ size_t ApproximateIndexBlockMemoryUsage() const {
+ assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
+ return index_block_.GetOwnValue()
+ ? index_block_.GetValue()->ApproximateMemoryUsage()
+ : 0;
+ }
+
+ private:
+ const BlockBasedTable* table_;
+ CachableEntry<Block> index_block_;
+};
+
+Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<Block>* index_block) {
+ PERF_TIMER_GUARD(read_index_block_nanos);
+
+ assert(table != nullptr);
+ assert(index_block != nullptr);
+ assert(index_block->IsEmpty());
+
+ const Rep* const rep = table->get_rep();
+ assert(rep != nullptr);
+
+ const Status s = table->RetrieveBlock(
+ prefetch_buffer, read_options, rep->footer.index_handle(),
+ UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
+ get_context, lookup_context, /* for_compaction */ false, use_cache);
+
+ return s;
+}
+
+Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
+ bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<Block>* index_block) const {
+ assert(index_block != nullptr);
+
+ if (!index_block_.IsEmpty()) {
+ index_block->SetUnownedValue(index_block_.GetValue());
+ return Status::OK();
+ }
+
+ ReadOptions read_options;
+ if (no_io) {
+ read_options.read_tier = kBlockCacheTier;
+ }
+
+ return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
+ cache_index_blocks(), get_context, lookup_context,
+ index_block);
+}
+
+// Index that allows binary search lookup in a two-level index structure.
+class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  // Read the partition index from the file and create an instance of
+ // `PartitionIndexReader`.
+ // On success, index_reader will be populated; otherwise it will remain
+ // unmodified.
+ static Status Create(const BlockBasedTable* table,
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+ bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader) {
+ assert(table != nullptr);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+ assert(index_reader != nullptr);
+
+ CachableEntry<Block> index_block;
+ if (prefetch || !use_cache) {
+ const Status s =
+ ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
+ /*get_context=*/nullptr, lookup_context, &index_block);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (use_cache && !pin) {
+ index_block.Reset();
+ }
+ }
+
+ index_reader->reset(
+ new PartitionIndexReader(table, std::move(index_block)));
+
+ return Status::OK();
+ }
+
+  // Return a two-level iterator: the first level is on the partition index.
+ InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool /* disable_prefix_seek */,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override {
+ const bool no_io = (read_options.read_tier == kBlockCacheTier);
+ CachableEntry<Block> index_block;
+ const Status s =
+ GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
+ if (!s.ok()) {
+ if (iter != nullptr) {
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ return NewErrorInternalIterator<IndexValue>(s);
+ }
+
+ InternalIteratorBase<IndexValue>* it = nullptr;
+
+ Statistics* kNullStats = nullptr;
+ // Filters are already checked before seeking the index
+ if (!partition_map_.empty()) {
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ it = NewTwoLevelIterator(
+ new BlockBasedTable::PartitionedIndexIteratorState(table(),
+ &partition_map_),
+ index_block.GetValue()->NewIndexIterator(
+ internal_comparator(), internal_comparator()->user_comparator(),
+ nullptr, kNullStats, true, index_has_first_key(),
+ index_key_includes_seq(), index_value_is_full()));
+ } else {
+ ReadOptions ro;
+ ro.fill_cache = read_options.fill_cache;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
+ table(), ro, *internal_comparator(),
+ index_block.GetValue()->NewIndexIterator(
+ internal_comparator(), internal_comparator()->user_comparator(),
+ nullptr, kNullStats, true, index_has_first_key(),
+ index_key_includes_seq(), index_value_is_full()),
+ false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
+ lookup_context ? lookup_context->caller
+ : TableReaderCaller::kUncategorized);
+ }
+
+ assert(it != nullptr);
+ index_block.TransferTo(it);
+
+ return it;
+
+ // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
+    // on-stack BlockIter while the state is on heap. Currently it assumes
+ // the first level iter is always on heap and will attempt to delete it
+ // in its destructor.
+ }
+
+ void CacheDependencies(bool pin) override {
+    // Before reading the partitions, prefetch them to avoid lots of IOs.
+ BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+ const BlockBasedTable::Rep* rep = table()->rep_;
+ IndexBlockIter biter;
+ BlockHandle handle;
+ Statistics* kNullStats = nullptr;
+
+ CachableEntry<Block> index_block;
+ Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
+ &lookup_context, &index_block);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(rep->ioptions.info_log,
+ "Error retrieving top-level index block while trying to "
+ "cache index partitions: %s",
+ s.ToString().c_str());
+ return;
+ }
+
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ index_block.GetValue()->NewIndexIterator(
+ internal_comparator(), internal_comparator()->user_comparator(), &biter,
+ kNullStats, true, index_has_first_key(), index_key_includes_seq(),
+ index_value_is_full());
+    // Index partitions are assumed to be consecutive. Prefetch them all.
+ // Read the first block offset
+ biter.SeekToFirst();
+ if (!biter.Valid()) {
+ // Empty index.
+ return;
+ }
+ handle = biter.value().handle;
+ uint64_t prefetch_off = handle.offset();
+
+ // Read the last block's offset
+ biter.SeekToLast();
+ if (!biter.Valid()) {
+ // Empty index.
+ return;
+ }
+ handle = biter.value().handle;
+ uint64_t last_off = handle.offset() + block_size(handle);
+ uint64_t prefetch_len = last_off - prefetch_off;
+ std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+ rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
+ s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
+ static_cast<size_t>(prefetch_len));
+
+ // After prefetch, read the partitions one by one
+ biter.SeekToFirst();
+ auto ro = ReadOptions();
+ for (; biter.Valid(); biter.Next()) {
+ handle = biter.value().handle;
+ CachableEntry<Block> block;
+ // TODO: Support counter batch update for partitioned index and
+ // filter blocks
+ s = table()->MaybeReadBlockAndLoadToCache(
+ prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
+ &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
+ /*contents=*/nullptr);
+
+ assert(s.ok() || block.GetValue() == nullptr);
+ if (s.ok() && block.GetValue() != nullptr) {
+ if (block.IsCached()) {
+ if (pin) {
+ partition_map_[handle.offset()] = std::move(block);
+ }
+ }
+ }
+ }
+ }
+
+ size_t ApproximateMemoryUsage() const override {
+ size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
+ return usage;
+ }
+
+ private:
+ PartitionIndexReader(const BlockBasedTable* t,
+ CachableEntry<Block>&& index_block)
+ : IndexReaderCommon(t, std::move(index_block)) {}
+
+ std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
+};
+
+// Index that allows binary search lookup for the first key of each block.
+// This class can be viewed as a thin wrapper for `Block` class which already
+// supports binary search.
+class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  // Read the index from the file and create an instance of
+ // `BinarySearchIndexReader`.
+ // On success, index_reader will be populated; otherwise it will remain
+ // unmodified.
+ static Status Create(const BlockBasedTable* table,
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+ bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader) {
+ assert(table != nullptr);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+ assert(index_reader != nullptr);
+
+ CachableEntry<Block> index_block;
+ if (prefetch || !use_cache) {
+ const Status s =
+ ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
+ /*get_context=*/nullptr, lookup_context, &index_block);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (use_cache && !pin) {
+ index_block.Reset();
+ }
+ }
+
+ index_reader->reset(
+ new BinarySearchIndexReader(table, std::move(index_block)));
+
+ return Status::OK();
+ }
+
+ InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool /* disable_prefix_seek */,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override {
+ const bool no_io = (read_options.read_tier == kBlockCacheTier);
+ CachableEntry<Block> index_block;
+ const Status s =
+ GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
+ if (!s.ok()) {
+ if (iter != nullptr) {
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ return NewErrorInternalIterator<IndexValue>(s);
+ }
+
+ Statistics* kNullStats = nullptr;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ auto it = index_block.GetValue()->NewIndexIterator(
+ internal_comparator(), internal_comparator()->user_comparator(), iter,
+ kNullStats, true, index_has_first_key(), index_key_includes_seq(),
+ index_value_is_full());
+
+ assert(it != nullptr);
+ index_block.TransferTo(it);
+
+ return it;
+ }
+
+ size_t ApproximateMemoryUsage() const override {
+ size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+ }
+
+ private:
+ BinarySearchIndexReader(const BlockBasedTable* t,
+ CachableEntry<Block>&& index_block)
+ : IndexReaderCommon(t, std::move(index_block)) {}
+};
+
+// Index that leverages an internal hash table to quicken the lookup for a given
+// key.
+class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+ static Status Create(const BlockBasedTable* table,
+ FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_index_iter, bool use_cache,
+ bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader) {
+ assert(table != nullptr);
+ assert(index_reader != nullptr);
+ assert(!pin || prefetch);
+
+ const BlockBasedTable::Rep* rep = table->get_rep();
+ assert(rep != nullptr);
+
+ CachableEntry<Block> index_block;
+ if (prefetch || !use_cache) {
+ const Status s =
+ ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
+ /*get_context=*/nullptr, lookup_context, &index_block);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (use_cache && !pin) {
+ index_block.Reset();
+ }
+ }
+
+    // Note: failure to create the prefix hash index need not be a hard
+    // error; we can still fall back to the original binary search index.
+    // So, from this point on, Create will succeed regardless.
+
+ index_reader->reset(new HashIndexReader(table, std::move(index_block)));
+
+ // Get prefixes block
+ BlockHandle prefixes_handle;
+ Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
+ &prefixes_handle);
+ if (!s.ok()) {
+ // TODO: log error
+ return Status::OK();
+ }
+
+ // Get index metadata block
+ BlockHandle prefixes_meta_handle;
+ s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
+ &prefixes_meta_handle);
+ if (!s.ok()) {
+ // TODO: log error
+ return Status::OK();
+ }
+
+ RandomAccessFileReader* const file = rep->file.get();
+ const Footer& footer = rep->footer;
+ const ImmutableCFOptions& ioptions = rep->ioptions;
+ const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
+ MemoryAllocator* const memory_allocator =
+ GetMemoryAllocator(rep->table_options);
+
+ // Read contents for the blocks
+ BlockContents prefixes_contents;
+ BlockFetcher prefixes_block_fetcher(
+ file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
+ &prefixes_contents, ioptions, true /*decompress*/,
+ true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
+ UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+ s = prefixes_block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ return s;
+ }
+ BlockContents prefixes_meta_contents;
+ BlockFetcher prefixes_meta_block_fetcher(
+ file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
+ &prefixes_meta_contents, ioptions, true /*decompress*/,
+ true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
+ UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+ s = prefixes_meta_block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ // TODO: log error
+ return Status::OK();
+ }
+
+ BlockPrefixIndex* prefix_index = nullptr;
+ assert(rep->internal_prefix_transform.get() != nullptr);
+ s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
+ prefixes_contents.data,
+ prefixes_meta_contents.data, &prefix_index);
+ // TODO: log error
+ if (s.ok()) {
+ HashIndexReader* const hash_index_reader =
+ static_cast<HashIndexReader*>(index_reader->get());
+ hash_index_reader->prefix_index_.reset(prefix_index);
+ }
+
+ return Status::OK();
+ }
+
+ InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override {
+ const bool no_io = (read_options.read_tier == kBlockCacheTier);
+ CachableEntry<Block> index_block;
+ const Status s =
+ GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
+ if (!s.ok()) {
+ if (iter != nullptr) {
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ return NewErrorInternalIterator<IndexValue>(s);
+ }
+
+ Statistics* kNullStats = nullptr;
+ const bool total_order_seek =
+ read_options.total_order_seek || disable_prefix_seek;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ auto it = index_block.GetValue()->NewIndexIterator(
+ internal_comparator(), internal_comparator()->user_comparator(), iter,
+ kNullStats, total_order_seek, index_has_first_key(),
+ index_key_includes_seq(), index_value_is_full(),
+ false /* block_contents_pinned */, prefix_index_.get());
+
+ assert(it != nullptr);
+ index_block.TransferTo(it);
+
+ return it;
+ }
+
+ size_t ApproximateMemoryUsage() const override {
+ size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
+#else
+ if (prefix_index_) {
+ usage += prefix_index_->ApproximateMemoryUsage();
+ }
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+ }
+
+ private:
+ HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
+ : IndexReaderCommon(t, std::move(index_block)) {}
+
+ std::unique_ptr<BlockPrefixIndex> prefix_index_;
+};
+
+void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
+ GetContext* get_context,
+ size_t usage) const {
+ Statistics* const statistics = rep_->ioptions.statistics;
+
+ PERF_COUNTER_ADD(block_cache_hit_count, 1);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
+ static_cast<uint32_t>(rep_->level));
+
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_hit;
+ get_context->get_context_stats_.num_cache_bytes_read += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_HIT);
+ RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
+ }
+
+ switch (block_type) {
+ case BlockType::kFilter:
+ PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
+
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_filter_hit;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
+ }
+ break;
+
+ case BlockType::kCompressionDictionary:
+ // TODO: introduce perf counter for compression dictionary hit count
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_compression_dict_hit;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ }
+ break;
+
+ case BlockType::kIndex:
+ PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
+
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_index_hit;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
+ }
+ break;
+
+ default:
+ // TODO: introduce dedicated tickers/statistics/counters
+ // for range tombstones
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_data_hit;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
+ }
+ break;
+ }
+}
+
+void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
+ GetContext* get_context) const {
+ Statistics* const statistics = rep_->ioptions.statistics;
+
+ // TODO: introduce aggregate (not per-level) block cache miss count
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
+ static_cast<uint32_t>(rep_->level));
+
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_miss;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_MISS);
+ }
+
+ // TODO: introduce perf counters for misses per block type
+ switch (block_type) {
+ case BlockType::kFilter:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_filter_miss;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
+ }
+ break;
+
+ case BlockType::kCompressionDictionary:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_compression_dict_miss;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ }
+ break;
+
+ case BlockType::kIndex:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_index_miss;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
+ }
+ break;
+
+ default:
+ // TODO: introduce dedicated tickers/statistics/counters
+ // for range tombstones
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_data_miss;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
+ }
+ break;
+ }
+}
+
+void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
+ GetContext* get_context,
+ size_t usage) const {
+ Statistics* const statistics = rep_->ioptions.statistics;
+
+ // TODO: introduce perf counters for block cache insertions
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_add;
+ get_context->get_context_stats_.num_cache_bytes_write += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_ADD);
+ RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
+ }
+
+ switch (block_type) {
+ case BlockType::kFilter:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_filter_add;
+ get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
+ RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
+ }
+ break;
+
+ case BlockType::kCompressionDictionary:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_compression_dict_add;
+ get_context->get_context_stats_
+ .num_cache_compression_dict_bytes_insert += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+ usage);
+ }
+ break;
+
+ case BlockType::kIndex:
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_index_add;
+ get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
+ RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
+ }
+ break;
+
+ default:
+ // TODO: introduce dedicated tickers/statistics/counters
+ // for range tombstones
+ if (get_context) {
+ ++get_context->get_context_stats_.num_cache_data_add;
+ get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
+ RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
+ }
+ break;
+ }
+}
+
+Cache::Handle* BlockBasedTable::GetEntryFromCache(
+ Cache* block_cache, const Slice& key, BlockType block_type,
+ GetContext* get_context) const {
+ auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);
+
+ if (cache_handle != nullptr) {
+ UpdateCacheHitMetrics(block_type, get_context,
+ block_cache->GetUsage(cache_handle));
+ } else {
+ UpdateCacheMissMetrics(block_type, get_context);
+ }
+
+ return cache_handle;
+}
+
+// Helper function to set up the cache key's prefix for the Table.
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
+ assert(kMaxCacheKeyPrefixSize >= 10);
+ rep->cache_key_prefix_size = 0;
+ rep->compressed_cache_key_prefix_size = 0;
+ if (rep->table_options.block_cache != nullptr) {
+ GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
+ &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
+ }
+ if (rep->table_options.persistent_cache != nullptr) {
+ GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
+ &rep->persistent_cache_key_prefix[0],
+ &rep->persistent_cache_key_prefix_size);
+ }
+ if (rep->table_options.block_cache_compressed != nullptr) {
+ GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
+ rep->file->file(), &rep->compressed_cache_key_prefix[0],
+ &rep->compressed_cache_key_prefix_size);
+ }
+}
+
+void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
+ char* buffer, size_t* size) {
+ // generate an id from the file
+ *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+
+ // If the prefix wasn't generated or was too long,
+ // create one from the cache.
+ if (cc != nullptr && *size == 0) {
+ char* end = EncodeVarint64(buffer, cc->NewId());
+ *size = static_cast<size_t>(end - buffer);
+ }
+}
+
+void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
+ char* buffer, size_t* size) {
+ // generate an id from the file
+ *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
+
+ // If the prefix wasn't generated or was too long,
+ // create one from the cache.
+ if (cc != nullptr && *size == 0) {
+ char* end = EncodeVarint64(buffer, cc->NewId());
+ *size = static_cast<size_t>(end - buffer);
+ }
+}
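+
+// Net effect of the two overloads above (sketch): the prefix is the file's
+// own unique id when available, or a varint64-encoded cache-wide unique id
+// otherwise. Combined with the block offset appended by GetCacheKey(), this
+// keeps cache keys unique across files sharing one block cache.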
+
+namespace {
+// Return true if table_properties has `user_prop_name` with a `true` value,
+// or if it doesn't contain this property (for backward compatibility).
+bool IsFeatureSupported(const TableProperties& table_properties,
+ const std::string& user_prop_name, Logger* info_log) {
+ auto& props = table_properties.user_collected_properties;
+ auto pos = props.find(user_prop_name);
+  // Older versions don't have this value set. Skip this check.
+ if (pos != props.end()) {
+ if (pos->second == kPropFalse) {
+ return false;
+ } else if (pos->second != kPropTrue) {
+      ROCKS_LOG_WARN(info_log, "Property %s has an invalid value %s",
+ user_prop_name.c_str(), pos->second.c_str());
+ }
+ }
+ return true;
+}
+
+// Caller has to ensure seqno is not nullptr.
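+// Decision sketch for the checks below:
+//   - no version property: OK (global seqno disabled) unless a global seqno
+//     property is present; non-external files must not carry one
+//   - version 1: OK only if no global seqno property is present
+//   - version >= 2: use the global seqno property; 0 or a value equal to
+//     largest_seqno is accepted, any other mismatch is Corruption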
+Status GetGlobalSequenceNumber(const TableProperties& table_properties,
+ SequenceNumber largest_seqno,
+ SequenceNumber* seqno) {
+ const auto& props = table_properties.user_collected_properties;
+ const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
+ const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+
+ *seqno = kDisableGlobalSequenceNumber;
+ if (version_pos == props.end()) {
+ if (seqno_pos != props.end()) {
+ std::array<char, 200> msg_buf;
+ // This is not an external sst file, global_seqno is not supported.
+ snprintf(
+ msg_buf.data(), msg_buf.max_size(),
+          "A non-external sst file has a global seqno property with value %s",
+ seqno_pos->second.c_str());
+ return Status::Corruption(msg_buf.data());
+ }
+ return Status::OK();
+ }
+
+ uint32_t version = DecodeFixed32(version_pos->second.c_str());
+ if (version < 2) {
+ if (seqno_pos != props.end() || version != 1) {
+ std::array<char, 200> msg_buf;
+ // This is a v1 external sst file, global_seqno is not supported.
+ snprintf(msg_buf.data(), msg_buf.max_size(),
+               "An external sst file with version %u has a global seqno "
+               "property with value %s",
+ version, seqno_pos->second.c_str());
+ return Status::Corruption(msg_buf.data());
+ }
+ return Status::OK();
+ }
+
+ // Since we have a plan to deprecate global_seqno, we do not return failure
+ // if seqno_pos == props.end(). We rely on version_pos to detect whether the
+ // SST is external.
+ SequenceNumber global_seqno(0);
+ if (seqno_pos != props.end()) {
+ global_seqno = DecodeFixed64(seqno_pos->second.c_str());
+ }
+  // SstTableReader opens the table reader with kMaxSequenceNumber as
+  // largest_seqno to denote that it is unknown.
+ if (largest_seqno < kMaxSequenceNumber) {
+ if (global_seqno == 0) {
+ global_seqno = largest_seqno;
+ }
+ if (global_seqno != largest_seqno) {
+ std::array<char, 200> msg_buf;
+ snprintf(
+ msg_buf.data(), msg_buf.max_size(),
+        "An external sst file with version %u has a global seqno property "
+        "with value %s, while the largest seqno in the file is %llu",
+ version, seqno_pos->second.c_str(),
+ static_cast<unsigned long long>(largest_seqno));
+ return Status::Corruption(msg_buf.data());
+ }
+ }
+ *seqno = global_seqno;
+
+ if (global_seqno > kMaxSequenceNumber) {
+ std::array<char, 200> msg_buf;
+ snprintf(msg_buf.data(), msg_buf.max_size(),
+             "An external sst file with version %u has a global seqno property "
+ "with value %llu, which is greater than kMaxSequenceNumber",
+ version, static_cast<unsigned long long>(global_seqno));
+ return Status::Corruption(msg_buf.data());
+ }
+
+ return Status::OK();
+}
+} // namespace
+
+Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
+ size_t cache_key_prefix_size,
+ const BlockHandle& handle, char* cache_key) {
+ assert(cache_key != nullptr);
+ assert(cache_key_prefix_size != 0);
+ assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
+ memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
+ char* end =
+ EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
+ return Slice(cache_key, static_cast<size_t>(end - cache_key));
+}
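+
+// Illustrative sketch (the buffer size is an assumption; 10 is the maximum
+// varint64 length): a cache key is the file-level prefix followed by the
+// varint64-encoded block offset.
+//
+//   char buf[BlockBasedTable::kMaxCacheKeyPrefixSize + 10];
+//   Slice key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size,
+//                           handle, buf);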
+
+Status BlockBasedTable::Open(
+ const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor,
+ const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
+ const int level, const bool immortal_table,
+ const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
+ BlockCacheTracer* const block_cache_tracer) {
+ table_reader->reset();
+
+ Status s;
+ Footer footer;
+ std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+
+  // Prefetch both index and filter blocks, down to all partitions.
+ const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
+ const bool preload_all = !table_options.cache_index_and_filter_blocks;
+
+ if (!ioptions.allow_mmap_reads) {
+ s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
+ preload_all, &prefetch_buffer);
+ } else {
+ // Should not prefetch for mmap mode.
+ prefetch_buffer.reset(new FilePrefetchBuffer(
+ nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
+ }
+
+ // Read in the following order:
+ // 1. Footer
+ // 2. [metaindex block]
+ // 3. [meta block: properties]
+ // 4. [meta block: range deletion tombstone]
+ // 5. [meta block: compression dictionary]
+ // 6. [meta block: index]
+ // 7. [meta block: filter]
+ s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
+ kBlockBasedTableMagicNumber);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!BlockBasedTableSupportedVersion(footer.version())) {
+ return Status::Corruption(
+ "Unknown Footer version. Maybe this file was created with newer "
+ "version of RocksDB?");
+ }
+
+ // We've successfully read the footer. We are ready to serve requests.
+  // Better not to mutate rep_ after creation; e.g., the
+  // internal_prefix_transform raw pointer will be used to create
+  // HashIndexReader, whose reset may access a dangling pointer.
+ BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+ Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
+ internal_comparator, skip_filters, level,
+ immortal_table);
+ rep->file = std::move(file);
+ rep->footer = footer;
+ rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
+ // We need to wrap data with internal_prefix_transform to make sure it can
+ // handle prefix correctly.
+ if (prefix_extractor != nullptr) {
+ rep->internal_prefix_transform.reset(
+ new InternalKeySliceTransform(prefix_extractor));
+ }
+ SetupCacheKeyPrefix(rep);
+ std::unique_ptr<BlockBasedTable> new_table(
+ new BlockBasedTable(rep, block_cache_tracer));
+
+ // page cache options
+ rep->persistent_cache_options =
+ PersistentCacheOptions(rep->table_options.persistent_cache,
+ std::string(rep->persistent_cache_key_prefix,
+ rep->persistent_cache_key_prefix_size),
+ rep->ioptions.statistics);
+
+ // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
+ // handle to null, otherwise it may be seen as uninitialized during the below
+ // meta-block reads.
+ rep->compression_dict_handle = BlockHandle::NullBlockHandle();
+
+ // Read metaindex
+ std::unique_ptr<Block> metaindex;
+ std::unique_ptr<InternalIterator> metaindex_iter;
+ s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
+ &metaindex_iter);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Populates table_properties and some fields that depend on it,
+ // such as index_type.
+ s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
+ metaindex_iter.get(), largest_seqno);
+ if (!s.ok()) {
+ return s;
+ }
+ s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
+ internal_comparator, &lookup_context);
+ if (!s.ok()) {
+ return s;
+ }
+ s = new_table->PrefetchIndexAndFilterBlocks(
+ prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
+ prefetch_all, table_options, level, &lookup_context);
+
+ if (s.ok()) {
+ // Update tail prefetch stats
+ assert(prefetch_buffer.get() != nullptr);
+ if (tail_prefetch_stats != nullptr) {
+ assert(prefetch_buffer->min_offset_read() < file_size);
+ tail_prefetch_stats->RecordEffectiveSize(
+ static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
+ }
+
+ *table_reader = std::move(new_table);
+ }
+
+ return s;
+}
+
+Status BlockBasedTable::PrefetchTail(
+ RandomAccessFileReader* file, uint64_t file_size,
+ TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
+ const bool preload_all,
+ std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
+ size_t tail_prefetch_size = 0;
+ if (tail_prefetch_stats != nullptr) {
+    // Multiple threads may get a 0 (no history) when running in parallel,
+    // but the no-history state will clear after the first of them finishes.
+ tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
+ }
+ if (tail_prefetch_size == 0) {
+    // Before reading the footer, read ahead backwards to prefetch data. Do
+    // more readahead if we're going to read the index/filter.
+ // TODO: This may incorrectly select small readahead in case partitioned
+ // index/filter is enabled and top-level partition pinning is enabled.
+ // That's because we need to issue readahead before we read the properties,
+ // at which point we don't yet know the index type.
+ tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
+ }
+ size_t prefetch_off;
+ size_t prefetch_len;
+ if (file_size < tail_prefetch_size) {
+ prefetch_off = 0;
+ prefetch_len = static_cast<size_t>(file_size);
+ } else {
+ prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
+ prefetch_len = tail_prefetch_size;
+ }
+ TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
+ &tail_prefetch_size);
+ Status s;
+  // TODO: this special logic should not be needed in the future.
+ if (!file->use_direct_io()) {
+ prefetch_buffer->reset(new FilePrefetchBuffer(
+ nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
+ s = file->Prefetch(prefetch_off, prefetch_len);
+ } else {
+ prefetch_buffer->reset(new FilePrefetchBuffer(
+ nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
+ s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
+ }
+ return s;
+}
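+
+// Net effect of PrefetchTail() above (sketch): with no recorded history, the
+// tail prefetch is 512 KB when index/filter blocks will be read up front
+// (prefetch_all || preload_all) and 4 KB otherwise; with history,
+// TailPrefetchStats drives the size. Buffered reads call Prefetch() on the
+// file directly, while direct I/O reads go through the FilePrefetchBuffer.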
+
+Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
+ uint32_t expected) {
+ Status s;
+ uint32_t actual = 0;
+ switch (type) {
+ case kNoChecksum:
+ break;
+ case kCRC32c:
+ expected = crc32c::Unmask(expected);
+ actual = crc32c::Value(buf, len);
+ break;
+ case kxxHash:
+ actual = XXH32(buf, static_cast<int>(len), 0);
+ break;
+ case kxxHash64:
+ actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
+ uint64_t{0xffffffff});
+ break;
+ default:
+ s = Status::Corruption("unknown checksum type");
+ }
+ if (s.ok() && actual != expected) {
+    s = Status::Corruption("block checksum mismatch");
+ }
+ return s;
+}
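+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// how callers of VerifyChecksum use the on-disk block trailer. Each block is
+// followed by a 5-byte trailer: 1 byte of compression type and 4 bytes of
+// checksum. The checksum covers the block data plus the type byte, which is
+// why callers pass len = block_size + 1 and decode the expected value at
+// buf + block_size + 1. The helper name is hypothetical.
+static Status VerifyBlockTrailer(ChecksumType type, const char* block_data,
+                                 size_t block_size) {
+  // Layout: | data (block_size bytes) | type (1 byte) | checksum (4 bytes) |
+  const uint32_t expected = DecodeFixed32(block_data + block_size + 1);
+  return VerifyChecksum(type, block_data, block_size + 1, expected);
+}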
+
+Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
+ FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
+ TableProperties** table_properties) {
+ assert(table_properties != nullptr);
+  // If this is an external SST file ingested with write_global_seqno set to
+  // true, then a checksum mismatch is expected, because the checksum was
+  // written by SstFileWriter but the global seqno in the properties block
+  // may have been changed during ingestion. In this case, we read the
+  // properties block, copy it to a memory buffer, change the global seqno
+  // back to its original value, i.e. 0, and verify the checksum again.
+ BlockHandle props_block_handle;
+ CacheAllocationPtr tmp_buf;
+ Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
+ rep_->footer, rep_->ioptions, table_properties,
+ false /* verify_checksum */, &props_block_handle,
+ &tmp_buf, false /* compression_type_missing */,
+ nullptr /* memory_allocator */);
+ if (s.ok() && tmp_buf) {
+ const auto seqno_pos_iter =
+ (*table_properties)
+ ->properties_offsets.find(
+ ExternalSstFilePropertyNames::kGlobalSeqno);
+ size_t block_size = static_cast<size_t>(props_block_handle.size());
+ if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
+ uint64_t global_seqno_offset = seqno_pos_iter->second;
+ EncodeFixed64(
+ tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
+ }
+ uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
+ s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
+ tmp_buf.get(), block_size + 1, value);
+ }
+ return s;
+}
+
+Status BlockBasedTable::ReadPropertiesBlock(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ const SequenceNumber largest_seqno) {
+ bool found_properties_block = true;
+ Status s;
+ s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
+
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(rep_->ioptions.info_log,
+ "Error when seeking to properties block from file: %s",
+ s.ToString().c_str());
+ } else if (found_properties_block) {
+ s = meta_iter->status();
+ TableProperties* table_properties = nullptr;
+ if (s.ok()) {
+ s = ReadProperties(
+ meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
+ rep_->ioptions, &table_properties, true /* verify_checksum */,
+ nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
+ false /* compression_type_missing */, nullptr /* memory_allocator */);
+ }
+
+ if (s.IsCorruption()) {
+ s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
+ &table_properties);
+ }
+ std::unique_ptr<TableProperties> props_guard;
+ if (table_properties != nullptr) {
+ props_guard.reset(table_properties);
+ }
+
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(rep_->ioptions.info_log,
+ "Encountered error while reading data from properties "
+ "block %s",
+ s.ToString().c_str());
+ } else {
+ assert(table_properties != nullptr);
+ rep_->table_properties.reset(props_guard.release());
+ rep_->blocks_maybe_compressed =
+ rep_->table_properties->compression_name !=
+ CompressionTypeToString(kNoCompression);
+ rep_->blocks_definitely_zstd_compressed =
+ (rep_->table_properties->compression_name ==
+ CompressionTypeToString(kZSTD) ||
+ rep_->table_properties->compression_name ==
+ CompressionTypeToString(kZSTDNotFinalCompression));
+ }
+ } else {
+ ROCKS_LOG_ERROR(rep_->ioptions.info_log,
+ "Cannot find Properties block from file.");
+ }
+#ifndef ROCKSDB_LITE
+ if (rep_->table_properties) {
+ ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
+ &(rep_->table_prefix_extractor));
+ }
+#endif // ROCKSDB_LITE
+
+  // Apply settings derived from the table properties, if present.
+ if (rep_->table_properties) {
+ rep_->whole_key_filtering &=
+ IsFeatureSupported(*(rep_->table_properties),
+ BlockBasedTablePropertyNames::kWholeKeyFiltering,
+ rep_->ioptions.info_log);
+ rep_->prefix_filtering &=
+ IsFeatureSupported(*(rep_->table_properties),
+ BlockBasedTablePropertyNames::kPrefixFiltering,
+ rep_->ioptions.info_log);
+
+ rep_->index_key_includes_seq =
+ rep_->table_properties->index_key_is_user_key == 0;
+ rep_->index_value_is_full =
+ rep_->table_properties->index_value_is_delta_encoded == 0;
+
+    // Update index_type with the true type.
+    // If the table properties don't contain the index type, we assume the
+    // table is in a very old format and uses the kBinarySearch index type.
+ auto& props = rep_->table_properties->user_collected_properties;
+ auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+ if (pos != props.end()) {
+ rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
+ DecodeFixed32(pos->second.c_str()));
+ }
+
+ rep_->index_has_first_key =
+ rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
+
+ s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
+ &(rep_->global_seqno));
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
+ }
+ }
+ return s;
+}
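+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the properties loaded above are exposed through the public API, e.g.:
+//
+//   rocksdb::TablePropertiesCollection props;
+//   rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
+//   for (const auto& file_and_props : props) {
+//     // file_and_props.first is the SST file name; .second holds fields
+//     // such as num_entries, compression_name, and index_key_is_user_key.
+//   }
+//
+// (db is an open rocksdb::DB*; shown as comments because this file does not
+// include the DB headers.)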
+
+Status BlockBasedTable::ReadRangeDelBlock(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ const InternalKeyComparator& internal_comparator,
+ BlockCacheLookupContext* lookup_context) {
+ Status s;
+ bool found_range_del_block;
+ BlockHandle range_del_handle;
+ s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ rep_->ioptions.info_log,
+ "Error when seeking to range delete tombstones block from file: %s",
+ s.ToString().c_str());
+ } else if (found_range_del_block && !range_del_handle.IsNull()) {
+ ReadOptions read_options;
+ std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
+ read_options, range_del_handle,
+ /*input_iter=*/nullptr, BlockType::kRangeDeletion,
+ /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
+ assert(iter != nullptr);
+ s = iter->status();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ rep_->ioptions.info_log,
+ "Encountered error while reading data from range del block %s",
+ s.ToString().c_str());
+ } else {
+ rep_->fragmented_range_dels =
+ std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
+ internal_comparator);
+ }
+ }
+ return s;
+}
+
+Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ BlockBasedTable* new_table, bool prefetch_all,
+ const BlockBasedTableOptions& table_options, const int level,
+ BlockCacheLookupContext* lookup_context) {
+ Status s;
+
+ // Find filter handle and filter type
+ if (rep_->filter_policy) {
+ for (auto filter_type :
+ {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
+ Rep::FilterType::kBlockFilter}) {
+ std::string prefix;
+ switch (filter_type) {
+ case Rep::FilterType::kFullFilter:
+ prefix = kFullFilterBlockPrefix;
+ break;
+ case Rep::FilterType::kPartitionedFilter:
+ prefix = kPartitionedFilterBlockPrefix;
+ break;
+ case Rep::FilterType::kBlockFilter:
+ prefix = kFilterBlockPrefix;
+ break;
+ default:
+ assert(0);
+ }
+ std::string filter_block_key = prefix;
+ filter_block_key.append(rep_->filter_policy->Name());
+ if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
+ .ok()) {
+ rep_->filter_type = filter_type;
+ break;
+ }
+ }
+ }
+
+ // Find compression dictionary handle
+ bool found_compression_dict = false;
+ s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
+ &rep_->compression_dict_handle);
+ if (!s.ok()) {
+ return s;
+ }
+
+ BlockBasedTableOptions::IndexType index_type = rep_->index_type;
+
+ const bool use_cache = table_options.cache_index_and_filter_blocks;
+
+ // pin both index and filters, down to all partitions
+ const bool pin_all =
+ rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
+
+ // prefetch the first level of index
+ const bool prefetch_index =
+ prefetch_all ||
+ (table_options.pin_top_level_index_and_filter &&
+ index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+ // pin the first level of index
+ const bool pin_index =
+ pin_all || (table_options.pin_top_level_index_and_filter &&
+ index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+
+ std::unique_ptr<IndexReader> index_reader;
+ s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
+ prefetch_index, pin_index, lookup_context,
+ &index_reader);
+ if (!s.ok()) {
+ return s;
+ }
+
+ rep_->index_reader = std::move(index_reader);
+
+  // The partitions of a partitioned index are always stored in the block
+  // cache. They therefore follow the pin/prefetch configuration regardless
+  // of the value of cache_index_and_filter_blocks.
+ if (prefetch_all) {
+ rep_->index_reader->CacheDependencies(pin_all);
+ }
+
+ // prefetch the first level of filter
+ const bool prefetch_filter =
+ prefetch_all ||
+ (table_options.pin_top_level_index_and_filter &&
+ rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+  // Partitioned filters cannot be enabled without partitioned indexes
+ assert(!prefetch_filter || prefetch_index);
+ // pin the first level of filter
+ const bool pin_filter =
+ pin_all || (table_options.pin_top_level_index_and_filter &&
+ rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+
+ if (rep_->filter_policy) {
+ auto filter = new_table->CreateFilterBlockReader(
+ prefetch_buffer, use_cache, prefetch_filter, pin_filter,
+ lookup_context);
+ if (filter) {
+      // Refer to the comment above about partitioned indexes always being
+      // cached
+ if (prefetch_all) {
+ filter->CacheDependencies(pin_all);
+ }
+
+ rep_->filter = std::move(filter);
+ }
+ }
+
+ if (!rep_->compression_dict_handle.IsNull()) {
+ std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+ s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
+ prefetch_all, pin_all, lookup_context,
+ &uncompression_dict_reader);
+ if (!s.ok()) {
+ return s;
+ }
+
+ rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
+ }
+
+ assert(s.ok());
+ return s;
+}
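+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// how the filter lookup key used above is composed. The metaindex maps
+// "<prefix><policy name>" to the filter block's handle, so a full filter
+// built with the builtin bloom policy would be registered under a key like
+// "fullfilter.rocksdb.BuiltinBloomFilter" (the exact prefix constants are
+// defined elsewhere in this file). The helper name is hypothetical.
+static std::string FilterMetaBlockKey(const std::string& prefix,
+                                      const FilterPolicy* policy) {
+  std::string key = prefix;    // e.g. kFullFilterBlockPrefix
+  key.append(policy->Name());  // e.g. "rocksdb.BuiltinBloomFilter"
+  return key;
+}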
+
+void BlockBasedTable::SetupForCompaction() {
+ switch (rep_->ioptions.access_hint_on_compaction_start) {
+ case Options::NONE:
+ break;
+ case Options::NORMAL:
+ rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
+ break;
+ case Options::SEQUENTIAL:
+ rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
+ break;
+ case Options::WILLNEED:
+ rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
+ break;
+ default:
+ assert(false);
+ }
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+ const {
+ return rep_->table_properties;
+}
+
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+ size_t usage = 0;
+ if (rep_->filter) {
+ usage += rep_->filter->ApproximateMemoryUsage();
+ }
+ if (rep_->index_reader) {
+ usage += rep_->index_reader->ApproximateMemoryUsage();
+ }
+ if (rep_->uncompression_dict_reader) {
+ usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
+ }
+ return usage;
+}
+
+// Load the metaindex block from the file. On success, return the loaded
+// metaindex block and its iterator.
+Status BlockBasedTable::ReadMetaIndexBlock(
+ FilePrefetchBuffer* prefetch_buffer,
+ std::unique_ptr<Block>* metaindex_block,
+ std::unique_ptr<InternalIterator>* iter) {
+ // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+ // it is an empty block.
+ std::unique_ptr<Block> metaindex;
+ Status s = ReadBlockFromFile(
+ rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
+ rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
+ true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
+ kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
+ GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
+ rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
+
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(rep_->ioptions.info_log,
+                    "Encountered error while reading data from metaindex"
+                    " block %s",
+ s.ToString().c_str());
+ return s;
+ }
+
+ *metaindex_block = std::move(metaindex);
+ // meta block uses bytewise comparator.
+ iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
+ BytewiseComparator()));
+ return Status::OK();
+}
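+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the metaindex block read above is an ordinary data block whose entries map
+// meta block names to varint-encoded BlockHandles, so locating a named meta
+// block is a seek plus a handle decode. The helper name is hypothetical;
+// FindMetaBlock elsewhere in the codebase plays this role.
+static Status FindMetaBlockHandle(InternalIterator* metaindex_iter,
+                                  const std::string& name,
+                                  BlockHandle* handle) {
+  metaindex_iter->Seek(name);
+  if (metaindex_iter->status().ok() && metaindex_iter->Valid() &&
+      metaindex_iter->key() == Slice(name)) {
+    Slice v = metaindex_iter->value();
+    return handle->DecodeFrom(&v);
+  }
+  return Status::Corruption("could not find meta block: " + name);
+}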
+
+template <typename TBlocklike>
+Status BlockBasedTable::GetDataBlockFromCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
+ const UncompressionDict& uncompression_dict, BlockType block_type,
+ GetContext* get_context) const {
+ const size_t read_amp_bytes_per_bit =
+ block_type == BlockType::kData
+ ? rep_->table_options.read_amp_bytes_per_bit
+ : 0;
+ assert(block);
+ assert(block->IsEmpty());
+
+ Status s;
+ BlockContents* compressed_block = nullptr;
+ Cache::Handle* block_cache_compressed_handle = nullptr;
+
+ // Lookup uncompressed cache first
+ if (block_cache != nullptr) {
+ auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
+ block_type, get_context);
+ if (cache_handle != nullptr) {
+ block->SetCachedValue(
+ reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
+ block_cache, cache_handle);
+ return s;
+ }
+ }
+
+  // If not found, search the compressed block cache.
+ assert(block->IsEmpty());
+
+ if (block_cache_compressed == nullptr) {
+ return s;
+ }
+
+ assert(!compressed_block_cache_key.empty());
+ block_cache_compressed_handle =
+ block_cache_compressed->Lookup(compressed_block_cache_key);
+
+ Statistics* statistics = rep_->ioptions.statistics;
+
+  // If we find the block in the compressed cache, uncompress it and insert
+  // it into the uncompressed cache.
+ if (block_cache_compressed_handle == nullptr) {
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+ return s;
+ }
+
+ // found compressed block
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
+ compressed_block = reinterpret_cast<BlockContents*>(
+ block_cache_compressed->Value(block_cache_compressed_handle));
+ CompressionType compression_type = compressed_block->get_compression_type();
+ assert(compression_type != kNoCompression);
+
+ // Retrieve the uncompressed contents into a new buffer
+ BlockContents contents;
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, uncompression_dict, compression_type);
+ s = UncompressBlockContents(
+ info, compressed_block->data.data(), compressed_block->data.size(),
+ &contents, rep_->table_options.format_version, rep_->ioptions,
+ GetMemoryAllocator(rep_->table_options));
+
+ // Insert uncompressed block into block cache
+ if (s.ok()) {
+ std::unique_ptr<TBlocklike> block_holder(
+ BlocklikeTraits<TBlocklike>::Create(
+ std::move(contents), rep_->get_global_seqno(block_type),
+ read_amp_bytes_per_bit, statistics,
+ rep_->blocks_definitely_zstd_compressed,
+ rep_->table_options.filter_policy.get())); // uncompressed block
+
+ if (block_cache != nullptr && block_holder->own_bytes() &&
+ read_options.fill_cache) {
+ size_t charge = block_holder->ApproximateMemoryUsage();
+ Cache::Handle* cache_handle = nullptr;
+ s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
+ &DeleteCachedEntry<TBlocklike>, &cache_handle);
+ if (s.ok()) {
+ assert(cache_handle != nullptr);
+ block->SetCachedValue(block_holder.release(), block_cache,
+ cache_handle);
+
+ UpdateCacheInsertionMetrics(block_type, get_context, charge);
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+ }
+ } else {
+ block->SetOwnedValue(block_holder.release());
+ }
+ }
+
+ // Release hold on compressed cache entry
+ block_cache_compressed->Release(block_cache_compressed_handle);
+ return s;
+}
+
+template <typename TBlocklike>
+Status BlockBasedTable::PutDataBlockToCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
+ CompressionType raw_block_comp_type,
+ const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
+ MemoryAllocator* memory_allocator, BlockType block_type,
+ GetContext* get_context) const {
+ const ImmutableCFOptions& ioptions = rep_->ioptions;
+ const uint32_t format_version = rep_->table_options.format_version;
+ const size_t read_amp_bytes_per_bit =
+ block_type == BlockType::kData
+ ? rep_->table_options.read_amp_bytes_per_bit
+ : 0;
+ const Cache::Priority priority =
+ rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
+ (block_type == BlockType::kFilter ||
+ block_type == BlockType::kCompressionDictionary ||
+ block_type == BlockType::kIndex)
+ ? Cache::Priority::HIGH
+ : Cache::Priority::LOW;
+ assert(cached_block);
+ assert(cached_block->IsEmpty());
+
+ Status s;
+ Statistics* statistics = ioptions.statistics;
+
+ std::unique_ptr<TBlocklike> block_holder;
+ if (raw_block_comp_type != kNoCompression) {
+ // Retrieve the uncompressed contents into a new buffer
+ BlockContents uncompressed_block_contents;
+ UncompressionContext context(raw_block_comp_type);
+ UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
+ s = UncompressBlockContents(info, raw_block_contents->data.data(),
+ raw_block_contents->data.size(),
+ &uncompressed_block_contents, format_version,
+ ioptions, memory_allocator);
+ if (!s.ok()) {
+ return s;
+ }
+
+ block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
+ std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
+ statistics, rep_->blocks_definitely_zstd_compressed,
+ rep_->table_options.filter_policy.get()));
+ } else {
+ block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
+ std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
+ statistics, rep_->blocks_definitely_zstd_compressed,
+ rep_->table_options.filter_policy.get()));
+ }
+
+ // Insert compressed block into compressed block cache.
+ // Release the hold on the compressed cache entry immediately.
+ if (block_cache_compressed != nullptr &&
+ raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
+ raw_block_contents->own_bytes()) {
+#ifndef NDEBUG
+ assert(raw_block_contents->is_raw_block);
+#endif // NDEBUG
+
+    // We cannot insert raw_block_contents directly, because it could point
+    // to an object on the stack.
+ BlockContents* block_cont_for_comp_cache =
+ new BlockContents(std::move(*raw_block_contents));
+ s = block_cache_compressed->Insert(
+ compressed_block_cache_key, block_cont_for_comp_cache,
+ block_cont_for_comp_cache->ApproximateMemoryUsage(),
+ &DeleteCachedEntry<BlockContents>);
+ if (s.ok()) {
+      // The compressed cache has taken ownership; do not delete the block
+      // here.
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ delete block_cont_for_comp_cache;
+ }
+ }
+
+ // insert into uncompressed block cache
+ if (block_cache != nullptr && block_holder->own_bytes()) {
+ size_t charge = block_holder->ApproximateMemoryUsage();
+ Cache::Handle* cache_handle = nullptr;
+ s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
+ &DeleteCachedEntry<TBlocklike>, &cache_handle,
+ priority);
+ if (s.ok()) {
+ assert(cache_handle != nullptr);
+ cached_block->SetCachedValue(block_holder.release(), block_cache,
+ cache_handle);
+
+ UpdateCacheInsertionMetrics(block_type, get_context, charge);
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+ }
+ } else {
+ cached_block->SetOwnedValue(block_holder.release());
+ }
+
+ return s;
+}
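+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the options that make the Cache::Priority::HIGH branch above take effect.
+// Index, filter, and compression dictionary blocks are inserted with high
+// priority only when they are cached at all and the high-priority flag is
+// set; pairing this with an LRU cache that reserves a high-priority pool
+// keeps them resident under data block pressure. Values are examples only.
+static BlockBasedTableOptions HighPriorityMetaBlockCacheExample() {
+  BlockBasedTableOptions opts;
+  opts.cache_index_and_filter_blocks = true;
+  opts.cache_index_and_filter_blocks_with_high_priority = true;
+  opts.block_cache = NewLRUCache(/*capacity=*/512 << 20,
+                                 /*num_shard_bits=*/6,
+                                 /*strict_capacity_limit=*/false,
+                                 /*high_pri_pool_ratio=*/0.5);
+  return opts;
+}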
+
+std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+ bool pin, BlockCacheLookupContext* lookup_context) {
+ auto& rep = rep_;
+ auto filter_type = rep->filter_type;
+ if (filter_type == Rep::FilterType::kNoFilter) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ assert(rep->filter_policy);
+
+ switch (filter_type) {
+ case Rep::FilterType::kPartitionedFilter:
+ return PartitionedFilterBlockReader::Create(
+ this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+ case Rep::FilterType::kBlockFilter:
+ return BlockBasedFilterBlockReader::Create(
+ this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+ case Rep::FilterType::kFullFilter:
+ return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
+ prefetch, pin, lookup_context);
+
+ default:
+ // filter_type is either kNoFilter (exited the function at the first if),
+ // or it must be covered in this switch block
+ assert(false);
+ return std::unique_ptr<FilterBlockReader>();
+ }
+}
+
+// disable_prefix_seek should be set to true when the prefix_extractor found
+// in the SST file differs from the one in mutable_cf_options and the index
+// type is HashBasedIndex
+InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* input_iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const {
+ assert(rep_ != nullptr);
+ assert(rep_->index_reader != nullptr);
+
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
+ input_iter, get_context,
+ lookup_context);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, a new iterator is allocated and returned;
+// otherwise input_iter is updated and returned.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(
+ const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
+ BlockType block_type, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context, Status s,
+ FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
+ PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+ TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+ if (!s.ok()) {
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ CachableEntry<UncompressionDict> uncompression_dict;
+ if (rep_->uncompression_dict_reader) {
+ const bool no_io = (ro.read_tier == kBlockCacheTier);
+ s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+ prefetch_buffer, no_io, get_context, lookup_context,
+ &uncompression_dict);
+ if (!s.ok()) {
+ iter->Invalidate(s);
+ return iter;
+ }
+ }
+
+ const UncompressionDict& dict = uncompression_dict.GetValue()
+ ? *uncompression_dict.GetValue()
+ : UncompressionDict::GetEmptyDict();
+
+ CachableEntry<Block> block;
+ s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
+ get_context, lookup_context, for_compaction,
+ /* use_cache */ true);
+
+ if (!s.ok()) {
+ assert(block.IsEmpty());
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ assert(block.GetValue() != nullptr);
+
+  // Block contents are pinned, and remain pinned even after the iterator is
+  // destroyed (as long as the cleanup functions are moved to another
+  // object), when:
+  // 1. the block cache handle is set to be released in a cleanup function, or
+  // 2. the contents point to an immortal source. If own_bytes is true, we
+  //    are not reading data from the original source, whether immortal or
+  //    not. Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned =
+ block.IsCached() ||
+ (!block.GetValue()->own_bytes() && rep_->immortal_table);
+ iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
+ block_contents_pinned);
+
+ if (!block.IsCached()) {
+ if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
+ // insert a dummy record to block cache to track the memory usage
+ Cache* const block_cache = rep_->table_options.block_cache.get();
+ Cache::Handle* cache_handle = nullptr;
+      // There are two other types of cache keys: 1) the SST cache key added
+      // in `MaybeReadBlockAndLoadToCache` and 2) the dummy cache key added in
+      // `write_buffer_manager`. Use a longer prefix (41 bytes) to
+      // differentiate from the SST cache key (31 bytes), and a non-zero
+      // prefix to differentiate from `write_buffer_manager`.
+ const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
+ char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
+ // Prefix: use rep_->cache_key_prefix padded by 0s
+ memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
+ assert(rep_->cache_key_prefix_size != 0);
+ assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
+ memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
+ char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
+ next_cache_key_id_++);
+ assert(end - cache_key <=
+ static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
+ const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
+ s = block_cache->Insert(unique_key, nullptr,
+ block.GetValue()->ApproximateMemoryUsage(),
+ nullptr, &cache_handle);
+
+ if (s.ok()) {
+ assert(cache_handle != nullptr);
+ iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+ cache_handle);
+ }
+ }
+ } else {
+ iter->SetCacheHandle(block.GetCacheHandle());
+ }
+
+ block.TransferTo(iter);
+
+ return iter;
+}
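+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the layout of the dummy cache key built above: a zero-padded prefix of
+// kMaxVarint64Length * 4 + 1 bytes (longer than the 31-byte SST block cache
+// key, and non-zero so it cannot collide with write_buffer_manager keys),
+// followed by a varint-encoded unique id. The helper name is hypothetical;
+// `out` must have room for the padded prefix plus kMaxVarint64Length bytes.
+static size_t BuildDummyCacheKey(const char* prefix, size_t prefix_size,
+                                 uint64_t unique_id, char* out) {
+  const size_t kPad = kMaxVarint64Length * 4 + 1;
+  assert(prefix_size <= kPad);
+  memset(out, 0, kPad);              // zero padding after the prefix
+  memcpy(out, prefix, prefix_size);  // table-specific cache key prefix
+  char* end = EncodeVarint64(out + kPad, unique_id);
+  return static_cast<size_t>(end - out);  // total key length
+}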
+
+template <>
+DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
+ const Rep* rep, Block* block, DataBlockIter* input_iter,
+ bool block_contents_pinned) {
+ return block->NewDataIterator(
+ &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+ input_iter, rep->ioptions.statistics, block_contents_pinned);
+}
+
+template <>
+IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
+ const Rep* rep, Block* block, IndexBlockIter* input_iter,
+ bool block_contents_pinned) {
+ return block->NewIndexIterator(
+ &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+ input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
+ rep->index_has_first_key, rep->index_key_includes_seq,
+ rep->index_value_is_full, block_contents_pinned);
+}
+
+// Convert an uncompressed data block (i.e., a CachableEntry<Block>)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, a new iterator is allocated and returned;
+// otherwise input_iter is updated and returned.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
+ CachableEntry<Block>& block,
+ TBlockIter* input_iter,
+ Status s) const {
+ PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+ TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+ if (!s.ok()) {
+ iter->Invalidate(s);
+ return iter;
+ }
+
+ assert(block.GetValue() != nullptr);
+  // Block contents are pinned, and remain pinned even after the iterator is
+  // destroyed (as long as the cleanup functions are moved to another
+  // object), when:
+  // 1. the block cache handle is set to be released in a cleanup function, or
+  // 2. the contents point to an immortal source. If own_bytes is true, we
+  //    are not reading data from the original source, whether immortal or
+  //    not. Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned =
+ block.IsCached() ||
+ (!block.GetValue()->own_bytes() && rep_->immortal_table);
+ iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
+ block_contents_pinned);
+
+ if (!block.IsCached()) {
+ if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
+ // insert a dummy record to block cache to track the memory usage
+ Cache* const block_cache = rep_->table_options.block_cache.get();
+ Cache::Handle* cache_handle = nullptr;
+      // There are two other types of cache keys: 1) the SST cache key added
+      // in `MaybeReadBlockAndLoadToCache` and 2) the dummy cache key added in
+      // `write_buffer_manager`. Use a longer prefix (41 bytes) to
+      // differentiate from the SST cache key (31 bytes), and a non-zero
+      // prefix to differentiate from `write_buffer_manager`.
+ const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
+ char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
+ // Prefix: use rep_->cache_key_prefix padded by 0s
+ memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
+ assert(rep_->cache_key_prefix_size != 0);
+ assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
+ memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
+ char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
+ next_cache_key_id_++);
+ assert(end - cache_key <=
+ static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
+ const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
+ s = block_cache->Insert(unique_key, nullptr,
+ block.GetValue()->ApproximateMemoryUsage(),
+ nullptr, &cache_handle);
+ if (s.ok()) {
+ assert(cache_handle != nullptr);
+ iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+ cache_handle);
+ }
+ }
+ } else {
+ iter->SetCacheHandle(block.GetCacheHandle());
+ }
+
+ block.TransferTo(iter);
+ return iter;
+}
+
+// If contents is nullptr, this function looks up the block caches for the
+// data block referenced by handle, and reads the block from disk if
+// necessary. If contents is non-null, it skips the cache lookup and disk
+// read, since the caller has already read it. In both cases, if
+// ro.fill_cache is true, it inserts the block into the block cache.
+template <typename TBlocklike>
+Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ BlockContents* contents) const {
+ assert(block_entry != nullptr);
+ const bool no_io = (ro.read_tier == kBlockCacheTier);
+ Cache* block_cache = rep_->table_options.block_cache.get();
+  // There is no point in caching compressed blocks if the table never goes
+  // away (i.e. it is immortal).
+ Cache* block_cache_compressed =
+ rep_->immortal_table ? nullptr
+ : rep_->table_options.block_cache_compressed.get();
+
+ // First, try to get the block from the cache
+ //
+ // If either block cache is enabled, we'll try to read from it.
+ Status s;
+ char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ Slice key /* key to the block cache */;
+ Slice ckey /* key to the compressed block cache */;
+ bool is_cache_hit = false;
+ if (block_cache != nullptr || block_cache_compressed != nullptr) {
+ // create key for block cache
+ if (block_cache != nullptr) {
+ key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
+ handle, cache_key);
+ }
+
+ if (block_cache_compressed != nullptr) {
+ ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
+ rep_->compressed_cache_key_prefix_size, handle,
+ compressed_cache_key);
+ }
+
+ if (!contents) {
+ s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
+ ro, block_entry, uncompression_dict, block_type,
+ get_context);
+ if (block_entry->GetValue()) {
+ // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
+ // compressed block cache.
+ is_cache_hit = true;
+ }
+ }
+
+    // The block was not found in the cache. If I/O is allowed, read it from
+    // the file.
+ if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
+ Statistics* statistics = rep_->ioptions.statistics;
+ const bool maybe_compressed =
+ block_type != BlockType::kFilter &&
+ block_type != BlockType::kCompressionDictionary &&
+ rep_->blocks_maybe_compressed;
+ const bool do_uncompress = maybe_compressed && !block_cache_compressed;
+ CompressionType raw_block_comp_type;
+ BlockContents raw_block_contents;
+ if (!contents) {
+ StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
+ BlockFetcher block_fetcher(
+ rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
+ &raw_block_contents, rep_->ioptions, do_uncompress,
+ maybe_compressed, block_type, uncompression_dict,
+ rep_->persistent_cache_options,
+ GetMemoryAllocator(rep_->table_options),
+ GetMemoryAllocatorForCompressedBlock(rep_->table_options));
+ s = block_fetcher.ReadBlockContents();
+ raw_block_comp_type = block_fetcher.get_compression_type();
+ contents = &raw_block_contents;
+ } else {
+ raw_block_comp_type = contents->get_compression_type();
+ }
+
+ if (s.ok()) {
+ SequenceNumber seq_no = rep_->get_global_seqno(block_type);
+        // If filling the cache is allowed and a cache is configured, try to
+        // put the block into the cache.
+ s = PutDataBlockToCache(
+ key, ckey, block_cache, block_cache_compressed, block_entry,
+ contents, raw_block_comp_type, uncompression_dict, seq_no,
+ GetMemoryAllocator(rep_->table_options), block_type, get_context);
+ }
+ }
+ }
+
+ // Fill lookup_context.
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
+ lookup_context) {
+ size_t usage = 0;
+ uint64_t nkeys = 0;
+ if (block_entry->GetValue()) {
+ // Approximate the number of keys in the block using restarts.
+ nkeys =
+ rep_->table_options.block_restart_interval *
+ BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
+ usage = block_entry->GetValue()->ApproximateMemoryUsage();
+ }
+ TraceType trace_block_type = TraceType::kTraceMax;
+ switch (block_type) {
+ case BlockType::kData:
+ trace_block_type = TraceType::kBlockTraceDataBlock;
+ break;
+ case BlockType::kFilter:
+ trace_block_type = TraceType::kBlockTraceFilterBlock;
+ break;
+ case BlockType::kCompressionDictionary:
+ trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
+ break;
+ case BlockType::kRangeDeletion:
+ trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
+ break;
+ case BlockType::kIndex:
+ trace_block_type = TraceType::kBlockTraceIndexBlock;
+ break;
+ default:
+ // This cannot happen.
+ assert(false);
+ break;
+ }
+ bool no_insert = no_io || !ro.fill_cache;
+ if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
+ trace_block_type, lookup_context->caller)) {
+ // Defer logging the access to Get() and MultiGet() to trace additional
+ // information, e.g., referenced_key_exist_in_block.
+
+ // Make a copy of the block key here since it will be logged later.
+ lookup_context->FillLookupContext(
+ is_cache_hit, no_insert, trace_block_type,
+ /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
+ } else {
+ // Avoid making copy of block_key and cf_name when constructing the access
+ // record.
+ BlockCacheTraceRecord access_record(
+ rep_->ioptions.env->NowMicros(),
+ /*block_key=*/"", trace_block_type,
+ /*block_size=*/usage, rep_->cf_id_for_tracing(),
+ /*cf_name=*/"", rep_->level_for_tracing(),
+ rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
+ no_insert, lookup_context->get_id,
+ lookup_context->get_from_user_specified_snapshot,
+ /*referenced_key=*/"");
+ block_cache_tracer_->WriteBlockAccess(access_record, key,
+ rep_->cf_name_for_tracing(),
+ lookup_context->referenced_key);
+ }
+ }
+
+ assert(s.ok() || block_entry->GetValue() == nullptr);
+ return s;
+}
+
+// This function reads multiple data blocks from disk using Env::MultiRead()
+// and optionally inserts them into the block cache. It uses the scratch
+// buffer provided by the caller, which is contiguous. If scratch is a nullptr
+// it allocates a separate buffer for each block. Typically, if the blocks
+// need to be uncompressed and there is no compressed block cache, callers
+// can allocate a temporary scratch buffer in order to minimize memory
+// allocations.
+// If options.fill_cache is true, it inserts the blocks into cache. If it's
+// false and scratch is non-null and the blocks are uncompressed, it copies
+// the buffers to heap. In any case, the CachableEntry<Block> returned will
+// own the data bytes.
+// If compression is enabled and also there is no compressed block cache,
+// the adjacent blocks are read out in one IO (combined read)
+// batch - A MultiGetRange with only those keys with unique data blocks not
+// found in cache
+// handles - A vector of block handles. Some of them may be null handles
+// scratch - An optional contiguous buffer to read compressed blocks into
+void BlockBasedTable::RetrieveMultipleBlocks(
+ const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
+ char* scratch, const UncompressionDict& uncompression_dict) const {
+ RandomAccessFileReader* file = rep_->file.get();
+ const Footer& footer = rep_->footer;
+ const ImmutableCFOptions& ioptions = rep_->ioptions;
+ SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
+ size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
+ MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
+
+ if (file->use_direct_io() || ioptions.allow_mmap_reads) {
+ size_t idx_in_batch = 0;
+ for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+ ++mget_iter, ++idx_in_batch) {
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet);
+ const BlockHandle& handle = (*handles)[idx_in_batch];
+ if (handle.IsNull()) {
+ continue;
+ }
+
+ (*statuses)[idx_in_batch] =
+ RetrieveBlock(nullptr, options, handle, uncompression_dict,
+ &(*results)[idx_in_batch], BlockType::kData,
+ mget_iter->get_context, &lookup_data_block_context,
+ /* for_compaction */ false, /* use_cache */ true);
+ }
+ return;
+ }
+
+ autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
+ size_t buf_offset = 0;
+ size_t idx_in_batch = 0;
+
+ uint64_t prev_offset = 0;
+ size_t prev_len = 0;
+ autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
+ autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
+ for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+ ++mget_iter, ++idx_in_batch) {
+ const BlockHandle& handle = (*handles)[idx_in_batch];
+ if (handle.IsNull()) {
+ continue;
+ }
+
+ size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
+
+    // If the current block is adjacent to the previous one, and at the same
+    // time compression is enabled and there is no compressed cache, we
+    // combine the two block reads into one.
+ if (scratch != nullptr && prev_end == handle.offset()) {
+ req_offset_for_block.emplace_back(prev_len);
+ prev_len += block_size(handle);
+ } else {
+      // No compression, or the current block is not adjacent to the previous
+      // one. Step 1: create a new request for the previous blocks.
+ if (prev_len != 0) {
+ FSReadRequest req;
+ req.offset = prev_offset;
+ req.len = prev_len;
+ if (scratch == nullptr) {
+ req.scratch = new char[req.len];
+ } else {
+ req.scratch = scratch + buf_offset;
+ buf_offset += req.len;
+ }
+ req.status = IOStatus::OK();
+ read_reqs.emplace_back(req);
+ }
+
+      // Step 2: remember the previous block's info
+ prev_offset = handle.offset();
+ prev_len = block_size(handle);
+ req_offset_for_block.emplace_back(0);
+ }
+ req_idx_for_block.emplace_back(read_reqs.size());
+ }
+ // Handle the last block and process the pending last request
+ if (prev_len != 0) {
+ FSReadRequest req;
+ req.offset = prev_offset;
+ req.len = prev_len;
+ if (scratch == nullptr) {
+ req.scratch = new char[req.len];
+ } else {
+ req.scratch = scratch + buf_offset;
+ }
+ req.status = IOStatus::OK();
+ read_reqs.emplace_back(req);
+ }
+
+ file->MultiRead(&read_reqs[0], read_reqs.size());
+
+ idx_in_batch = 0;
+ size_t valid_batch_idx = 0;
+ for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+ ++mget_iter, ++idx_in_batch) {
+ const BlockHandle& handle = (*handles)[idx_in_batch];
+
+ if (handle.IsNull()) {
+ continue;
+ }
+
+ assert(valid_batch_idx < req_idx_for_block.size());
+ assert(valid_batch_idx < req_offset_for_block.size());
+ assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
+ size_t& req_idx = req_idx_for_block[valid_batch_idx];
+ size_t& req_offset = req_offset_for_block[valid_batch_idx];
+ valid_batch_idx++;
+ FSReadRequest& req = read_reqs[req_idx];
+ Status s = req.status;
+ if (s.ok()) {
+ if (req.result.size() != req.len) {
+ s = Status::Corruption(
+ "truncated block read from " + rep_->file->file_name() +
+ " offset " + ToString(handle.offset()) + ", expected " +
+ ToString(req.len) + " bytes, got " + ToString(req.result.size()));
+ }
+ }
+
+ BlockContents raw_block_contents;
+ size_t cur_read_end = req_offset + block_size(handle);
+ if (cur_read_end > req.result.size()) {
+ s = Status::Corruption(
+ "truncated block read from " + rep_->file->file_name() + " offset " +
+ ToString(handle.offset()) + ", expected " + ToString(req.len) +
+ " bytes, got " + ToString(req.result.size()));
+ }
+
+ bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
+ if (s.ok()) {
+ if (scratch == nullptr && !blocks_share_read_buffer) {
+ // We allocated a buffer for this block. Give ownership of it to
+ // BlockContents so it can free the memory
+ assert(req.result.data() == req.scratch);
+ std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
+ raw_block_contents = BlockContents(std::move(raw_block), handle.size());
+ } else {
+        // We used the scratch buffer, which is shared by the blocks;
+        // raw_block_contents does not take ownership.
+ raw_block_contents =
+ BlockContents(Slice(req.scratch + req_offset, handle.size()));
+ }
+
+#ifndef NDEBUG
+ raw_block_contents.is_raw_block = true;
+#endif
+ if (options.verify_checksums) {
+ PERF_TIMER_GUARD(block_checksum_time);
+ const char* data = req.result.data();
+ uint32_t expected =
+ DecodeFixed32(data + req_offset + handle.size() + 1);
+        // Since the scratch buffer might be shared, the offset of the data
+        // block in the buffer might not be 0. req.result.data() only points
+        // to the beginning of each read request, so we need to add the
+        // block's offset within that request. The checksum is stored in the
+        // block trailer, which starts at handle.size() + 1.
+ s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
+ req.result.data() + req_offset,
+ handle.size() + 1, expected);
+ TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
+ }
+ }
+
+ if (s.ok()) {
+      // This handles a rare case: compression is enabled and there is no
+      // compressed cache (so combined reads are enabled), i.e. scratch is
+      // non-null, yet some blocks are actually not compressed because their
+      // compression space saving was smaller than the threshold. In that
+      // case, if a block shares the scratch memory, we need to copy it to
+      // the heap so that it can be added to the regular block cache.
+ CompressionType compression_type =
+ raw_block_contents.get_compression_type();
+ if (scratch != nullptr && compression_type == kNoCompression) {
+ Slice raw = Slice(req.scratch + req_offset, block_size(handle));
+ raw_block_contents = BlockContents(
+ CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
+ handle.size());
+#ifndef NDEBUG
+ raw_block_contents.is_raw_block = true;
+#endif
+ }
+ }
+
+ if (s.ok()) {
+ if (options.fill_cache) {
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet);
+ CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
+ // MaybeReadBlockAndLoadToCache will insert into the block caches if
+ // necessary. Since we're passing the raw block contents, it will
+ // avoid looking up the block cache
+ s = MaybeReadBlockAndLoadToCache(
+ nullptr, options, handle, uncompression_dict, block_entry,
+ BlockType::kData, mget_iter->get_context,
+ &lookup_data_block_context, &raw_block_contents);
+
+        // block_entry value could be null if no block cache is present, i.e.
+        // BlockBasedTableOptions::no_block_cache is true and no compressed
+        // block cache is configured. In that case, fall through and set up
+        // the block explicitly.
+ if (block_entry->GetValue() != nullptr) {
+ continue;
+ }
+ }
+
+ CompressionType compression_type =
+ raw_block_contents.get_compression_type();
+ BlockContents contents;
+ if (compression_type != kNoCompression) {
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, uncompression_dict, compression_type);
+ s = UncompressBlockContents(info, req.result.data() + req_offset,
+ handle.size(), &contents, footer.version(),
+ rep_->ioptions, memory_allocator);
+ } else {
+        // There are two cases here: 1) the caller uses the scratch buffer;
+        // 2) we use the request buffer. If the scratch buffer is used, we
+        // ensure that all raw blocks are copied to the heap as individual
+        // blocks. If the scratch buffer is not used, there is also no
+        // combined read, so the raw block can be used directly.
+ contents = std::move(raw_block_contents);
+ }
+ if (s.ok()) {
+ (*results)[idx_in_batch].SetOwnedValue(
+ new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
+ ioptions.statistics));
+ }
+ }
+ (*statuses)[idx_in_batch] = s;
+ }
+}
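+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the read-coalescing rule used above, in isolation. When a shared scratch
+// buffer is in use, consecutive handles whose on-disk ranges (block data
+// plus the 5-byte trailer) are adjacent get merged into a single read
+// request. The struct and helper name are hypothetical.
+struct ByteRange {
+  uint64_t offset;
+  size_t len;
+};
+static std::vector<ByteRange> CoalesceAdjacentReads(
+    const std::vector<ByteRange>& blocks) {
+  std::vector<ByteRange> reqs;
+  for (const ByteRange& b : blocks) {
+    if (!reqs.empty() && reqs.back().offset + reqs.back().len == b.offset) {
+      reqs.back().len += b.len;  // adjacent: extend the previous request
+    } else {
+      reqs.push_back(b);  // gap (or first block): start a new request
+    }
+  }
+  return reqs;
+}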
+
+template <typename TBlocklike>
+Status BlockBasedTable::RetrieveBlock(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const {
+ assert(block_entry);
+ assert(block_entry->IsEmpty());
+
+ Status s;
+ if (use_cache) {
+ s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
+ uncompression_dict, block_entry,
+ block_type, get_context, lookup_context,
+ /*contents=*/nullptr);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (block_entry->GetValue() != nullptr) {
+ assert(s.ok());
+ return s;
+ }
+ }
+
+ assert(block_entry->IsEmpty());
+
+ const bool no_io = ro.read_tier == kBlockCacheTier;
+ if (no_io) {
+ return Status::Incomplete("no blocking io");
+ }
+
+ const bool maybe_compressed =
+ block_type != BlockType::kFilter &&
+ block_type != BlockType::kCompressionDictionary &&
+ rep_->blocks_maybe_compressed;
+ const bool do_uncompress = maybe_compressed;
+ std::unique_ptr<TBlocklike> block;
+
+ {
+ StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
+ READ_BLOCK_GET_MICROS);
+ s = ReadBlockFromFile(
+ rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
+ rep_->ioptions, do_uncompress, maybe_compressed, block_type,
+ uncompression_dict, rep_->persistent_cache_options,
+ rep_->get_global_seqno(block_type),
+ block_type == BlockType::kData
+ ? rep_->table_options.read_amp_bytes_per_bit
+ : 0,
+ GetMemoryAllocator(rep_->table_options), for_compaction,
+ rep_->blocks_definitely_zstd_compressed,
+ rep_->table_options.filter_policy.get());
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ block_entry->SetOwnedValue(block.release());
+
+ assert(s.ok());
+ return s;
+}
+
+// Explicitly instantiate templates for both "blocklike" types we use.
+// This makes it possible to keep the template definitions in the .cc file.
+template Status BlockBasedTable::RetrieveBlock<BlockContents>(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<BlockContents>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+template Status BlockBasedTable::RetrieveBlock<Block>(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<Block>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
+ const BlockBasedTable* table,
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
+ : table_(table), block_map_(block_map) {}
+
+InternalIteratorBase<IndexValue>*
+BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
+ const BlockHandle& handle) {
+ // Return a block iterator on the index partition
+ auto block = block_map_->find(handle.offset());
+  // Not finding the partition is a possible scenario, since the block cache
+  // might not have had space for it.
+ if (block != block_map_->end()) {
+ const Rep* rep = table_->get_rep();
+ assert(rep);
+
+ Statistics* kNullStats = nullptr;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ return block->second.GetValue()->NewIndexIterator(
+ &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+ nullptr, kNullStats, true, rep->index_has_first_key,
+ rep->index_key_includes_seq, rep->index_value_is_full);
+ }
+ // Create an empty iterator
+ return new IndexBlockIter();
+}
+
+// This will be broken if the user specifies an unusual implementation
+// of Options.comparator, or if the user specifies an unusual
+// definition of prefixes in BlockBasedTableOptions.filter_policy.
+// In particular, we require the following three properties:
+//
+// 1) key.starts_with(prefix(key))
+// 2) Compare(prefix(key), key) <= 0.
+// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// Otherwise, this method guarantees no I/O will be incurred.
+//
+// REQUIRES: this method shouldn't be called while the DB lock is held.
+bool BlockBasedTable::PrefixMayMatch(
+ const Slice& internal_key, const ReadOptions& read_options,
+ const SliceTransform* options_prefix_extractor,
+ const bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) const {
+ if (!rep_->filter_policy) {
+ return true;
+ }
+
+ const SliceTransform* prefix_extractor;
+
+ if (rep_->table_prefix_extractor == nullptr) {
+ if (need_upper_bound_check) {
+ return true;
+ }
+ prefix_extractor = options_prefix_extractor;
+ } else {
+ prefix_extractor = rep_->table_prefix_extractor.get();
+ }
+ auto user_key = ExtractUserKey(internal_key);
+ if (!prefix_extractor->InDomain(user_key)) {
+ return true;
+ }
+
+ bool may_match = true;
+ Status s;
+
+  // First, try checking with the full filter
+ FilterBlockReader* const filter = rep_->filter.get();
+ bool filter_checked = true;
+ if (filter != nullptr) {
+ if (!filter->IsBlockBased()) {
+ const Slice* const const_ikey_ptr = &internal_key;
+ may_match = filter->RangeMayExist(
+ read_options.iterate_upper_bound, user_key, prefix_extractor,
+ rep_->internal_comparator.user_comparator(), const_ikey_ptr,
+ &filter_checked, need_upper_bound_check, lookup_context);
+ } else {
+ // if prefix_extractor changed for block based filter, skip filter
+ if (need_upper_bound_check) {
+ return true;
+ }
+ auto prefix = prefix_extractor->Transform(user_key);
+ InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
+ auto internal_prefix = internal_key_prefix.Encode();
+
+ // To prevent any io operation in this method, we set `read_tier` to make
+ // sure we always read index or filter only when they have already been
+ // loaded to memory.
+ ReadOptions no_io_read_options;
+ no_io_read_options.read_tier = kBlockCacheTier;
+
+      // Then, try to find it within each block.
+      // We already know prefix_extractor and prefix_extractor_name must match
+      // because `CheckPrefixMayMatch` first checks `check_filter_ == true`.
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
+ no_io_read_options,
+ /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+ /*get_context=*/nullptr, lookup_context));
+ iiter->Seek(internal_prefix);
+
+ if (!iiter->Valid()) {
+ // we're past end of file
+ // if it's incomplete, it means that we avoided I/O
+ // and we're not really sure that we're past the end
+ // of the file
+ may_match = iiter->status().IsIncomplete();
+ } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
+ : iiter->key())
+ .starts_with(ExtractUserKey(internal_prefix))) {
+ // we need to check for this subtle case because our only
+ // guarantee is that "the key is a string >= last key in that data
+ // block" according to the doc/table_format.txt spec.
+ //
+ // Suppose iiter->key() starts with the desired prefix; it is not
+ // necessarily the case that the corresponding data block will
+ // contain the prefix, since iiter->key() need not be in the
+ // block. However, the next data block may contain the prefix, so
+ // we return true to play it safe.
+ may_match = true;
+ } else if (filter->IsBlockBased()) {
+ // iiter->key() does NOT start with the desired prefix. Because
+ // Seek() finds the first key that is >= the seek target, this
+ // means that iiter->key() > prefix. Thus, any data blocks coming
+ // after the data block corresponding to iiter->key() cannot
+        // possibly contain the key. Thus, the corresponding data block
+        // is the only one that could potentially contain the prefix.
+ BlockHandle handle = iiter->value().handle;
+ may_match = filter->PrefixMayMatch(
+ prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
+ /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
+ }
+ }
+ }
+
+ if (filter_checked) {
+ Statistics* statistics = rep_->ioptions.statistics;
+ RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+ if (!may_match) {
+ RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+ }
+ }
+
+ return may_match;
+}
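+
+// Illustrative sketch (editorial addition, not part of the original patch):
+// the first two prefix-extractor properties listed before PrefixMayMatch,
+// checked for a fixed-length prefix under bytewise comparison (the third
+// property relates pairs of keys and is not checked here). For example,
+// with a 3-byte prefix, prefix("abcdef") = "abc": the key starts with its
+// prefix and the prefix compares <= the key. The helper name is
+// hypothetical.
+static bool FixedPrefixPropertiesHold(const Slice& key, size_t prefix_len) {
+  if (key.size() < prefix_len) {
+    return true;  // outside the extractor's domain; nothing to check
+  }
+  Slice prefix(key.data(), prefix_len);
+  return key.starts_with(prefix) &&  // property 1
+         prefix.compare(key) <= 0;   // property 2
+}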
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
+ SeekImpl(&target);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
+ SeekImpl(nullptr);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
+ const Slice* target) {
+ is_out_of_bound_ = false;
+ is_at_first_key_from_index_ = false;
+ if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
+ ResetDataIter();
+ return;
+ }
+
+ bool need_seek_index = true;
+ if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
+ // Reseek.
+ prev_block_offset_ = index_iter_->value().handle.offset();
+
+ if (target) {
+ // We can avoid an index seek if:
+ // 1. The new seek key is larger than the current key
+ // 2. The new seek key is within the upper bound of the block
+ // Since we don't necessarily know the internal key for either
+ // the current key or the upper bound, we check user keys and
+      // exclude the equality case. Considering internal keys could improve
+      // the boundary cases, but it would complicate the code.
+ if (user_comparator_.Compare(ExtractUserKey(*target),
+ block_iter_.user_key()) > 0 &&
+ user_comparator_.Compare(ExtractUserKey(*target),
+ index_iter_->user_key()) < 0) {
+ need_seek_index = false;
+ }
+ }
+ }
+
+ if (need_seek_index) {
+ if (target) {
+ index_iter_->Seek(*target);
+ } else {
+ index_iter_->SeekToFirst();
+ }
+
+ if (!index_iter_->Valid()) {
+ ResetDataIter();
+ return;
+ }
+ }
+
+ IndexValue v = index_iter_->value();
+ const bool same_block = block_iter_points_to_real_block_ &&
+ v.handle.offset() == prev_block_offset_;
+
+ // TODO(kolmike): Remove the != kBlockCacheTier condition.
+ if (!v.first_internal_key.empty() && !same_block &&
+ (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
+ read_options_.read_tier != kBlockCacheTier) {
+ // Index contains the first key of the block, and it's >= target.
+ // We can defer reading the block.
+ is_at_first_key_from_index_ = true;
+ // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
+ // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
+ // as that will be done later when the data block is actually read.
+ ResetDataIter();
+ } else {
+ // Need to use the data block.
+ if (!same_block) {
+ InitDataBlock();
+ } else {
+ // When the user does a reseek, the iterate_upper_bound might have
+ // changed. CheckDataBlockWithinUpperBound() needs to be called
+ // explicitly if the reseek ends up in the same data block.
+ // If the reseek ends up in a different block, InitDataBlock() will do
+ // the iterator upper bound check.
+ CheckDataBlockWithinUpperBound();
+ }
+
+ if (target) {
+ block_iter_.Seek(*target);
+ } else {
+ block_iter_.SeekToFirst();
+ }
+ FindKeyForward();
+ }
+
+ CheckOutOfBound();
+
+ if (target) {
+ assert(!Valid() || ((block_type_ == BlockType::kIndex &&
+ !table_->get_rep()->index_key_includes_seq)
+ ? (user_comparator_.Compare(ExtractUserKey(*target),
+ key()) <= 0)
+ : (icomp_.Compare(*target, key()) <= 0)));
+ }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
+ const Slice& target) {
+ is_out_of_bound_ = false;
+ is_at_first_key_from_index_ = false;
+  // For now, totally disable prefix seek in auto prefix mode because we
+  // don't have the logic to handle it yet.
+ if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
+ ResetDataIter();
+ return;
+ }
+
+ SavePrevIndexValue();
+
+  // Call Seek() rather than SeekForPrev() in the index block, because the
+  // target data block is likely to contain the position for `target`, the
+  // same as Seek(), rather than the one before it.
+ // For example, if we have three data blocks, each containing two keys:
+ // [2, 4] [6, 8] [10, 12]
+ // (the keys in the index block would be [4, 8, 12])
+ // and the user calls SeekForPrev(7), we need to go to the second block,
+ // just like if they call Seek(7).
+  // The only case where the block is different is when they seek to a
+  // position on a block boundary. For example, if they SeekForPrev(5), we
+  // should go to the first block rather than the second. However, we don't
+  // have the information to distinguish the two cases unless we read the
+  // second block; in that case, we end up reading two blocks.
+ index_iter_->Seek(target);
+
+ if (!index_iter_->Valid()) {
+ auto seek_status = index_iter_->status();
+ // Check for IO error
+ if (!seek_status.IsNotFound() && !seek_status.ok()) {
+ ResetDataIter();
+ return;
+ }
+
+ // With prefix index, Seek() returns NotFound if the prefix doesn't exist
+ if (seek_status.IsNotFound()) {
+ // Any key less than the target is fine for prefix seek
+ ResetDataIter();
+ return;
+ } else {
+ index_iter_->SeekToLast();
+ }
+ // Check for IO error
+ if (!index_iter_->Valid()) {
+ ResetDataIter();
+ return;
+ }
+ }
+
+ InitDataBlock();
+
+ block_iter_.SeekForPrev(target);
+
+ FindKeyBackward();
+ CheckDataBlockWithinUpperBound();
+ assert(!block_iter_.Valid() ||
+ icomp_.Compare(target, block_iter_.key()) >= 0);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
+ is_out_of_bound_ = false;
+ is_at_first_key_from_index_ = false;
+ SavePrevIndexValue();
+ index_iter_->SeekToLast();
+ if (!index_iter_->Valid()) {
+ ResetDataIter();
+ return;
+ }
+ InitDataBlock();
+ block_iter_.SeekToLast();
+ FindKeyBackward();
+ CheckDataBlockWithinUpperBound();
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
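+  // If the iterator is logically positioned at the first key of the current
+  // block (known from the index entry without reading the block), the block
+  // must be materialized before we can step forward within it.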
+ if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
+ return;
+ }
+ assert(block_iter_points_to_real_block_);
+ block_iter_.Next();
+ FindKeyForward();
+ CheckOutOfBound();
+}
+
+template <class TBlockIter, typename TValue>
+bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
+ IterateResult* result) {
+ Next();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
+ }
+ return is_valid;
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
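+  // If we are logically at the first key of the current block (the block was
+  // never read), the previous entry is the last key of the preceding block:
+  // step the index iterator back instead of materializing this block.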
+ if (is_at_first_key_from_index_) {
+ is_at_first_key_from_index_ = false;
+
+ index_iter_->Prev();
+ if (!index_iter_->Valid()) {
+ return;
+ }
+
+ InitDataBlock();
+ block_iter_.SeekToLast();
+ } else {
+ assert(block_iter_points_to_real_block_);
+ block_iter_.Prev();
+ }
+
+ FindKeyBackward();
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
+ BlockHandle data_block_handle = index_iter_->value().handle;
+ if (!block_iter_points_to_real_block_ ||
+ data_block_handle.offset() != prev_block_offset_ ||
+      // if the previous attempt to read the block missed the cache, try again
+ block_iter_.status().IsIncomplete()) {
+ if (block_iter_points_to_real_block_) {
+ ResetDataIter();
+ }
+ auto* rep = table_->get_rep();
+
+ // Prefetch additional data for range scans (iterators). Enabled only for
+ // user reads.
+ // Implicit auto readahead:
+ // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+ // Explicit user requested readahead:
+ // Enabled from the very first IO when ReadOptions.readahead_size is set.
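+    // For example: with buffered I/O, each prefetch doubles readahead_size_
+    // (capped at kMaxAutoReadaheadSize), yielding a schedule of roughly
+    // 8 KB, 16 KB, 32 KB, ... given the 8 KB kInitAutoReadaheadSize default
+    // declared in the header.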
+ if (lookup_context_.caller != TableReaderCaller::kCompaction) {
+ if (read_options_.readahead_size == 0) {
+ // Implicit auto readahead
+ num_file_reads_++;
+ if (num_file_reads_ >
+ BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
+ if (!rep->file->use_direct_io() &&
+ (data_block_handle.offset() +
+ static_cast<size_t>(block_size(data_block_handle)) >
+ readahead_limit_)) {
+ // Buffered I/O
+ // Discarding the return status of Prefetch calls intentionally, as
+ // we can fallback to reading from disk if Prefetch fails.
+ rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
+ readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
+ readahead_size_);
+ // Keep exponentially increasing readahead size until
+ // kMaxAutoReadaheadSize.
+ readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
+ readahead_size_ * 2);
+ } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
+ // Direct I/O
+ // Let FilePrefetchBuffer take care of the readahead.
+ rep->CreateFilePrefetchBuffer(
+ BlockBasedTable::kInitAutoReadaheadSize,
+ BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
+ }
+ }
+ } else if (!prefetch_buffer_) {
+ // Explicit user requested readahead
+ // The actual condition is:
+ // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
+ rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
+ read_options_.readahead_size,
+ &prefetch_buffer_);
+ }
+ } else if (!prefetch_buffer_) {
+ rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
+ compaction_readahead_size_,
+ &prefetch_buffer_);
+ }
+
+ Status s;
+ table_->NewDataBlockIterator<TBlockIter>(
+ read_options_, data_block_handle, &block_iter_, block_type_,
+ /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
+ /*for_compaction=*/lookup_context_.caller ==
+ TableReaderCaller::kCompaction);
+ block_iter_points_to_real_block_ = true;
+ CheckDataBlockWithinUpperBound();
+ }
+}
+
+template <class TBlockIter, typename TValue>
+bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
+ assert(is_at_first_key_from_index_);
+ assert(!block_iter_points_to_real_block_);
+ assert(index_iter_->Valid());
+
+ is_at_first_key_from_index_ = false;
+ InitDataBlock();
+ assert(block_iter_points_to_real_block_);
+ block_iter_.SeekToFirst();
+
+ if (!block_iter_.Valid() ||
+ icomp_.Compare(block_iter_.key(),
+ index_iter_->value().first_internal_key) != 0) {
+    // The data block's actual first key does not match the index entry.
+ block_iter_.Invalidate(Status::Corruption(
+ "first key in index doesn't match first key in block"));
+ return false;
+ }
+
+ return true;
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
+ // This method's code is kept short to make it likely to be inlined.
+
+ assert(!is_out_of_bound_);
+ assert(block_iter_points_to_real_block_);
+
+ if (!block_iter_.Valid()) {
+ // This is the only call site of FindBlockForward(), but it's extracted into
+ // a separate method to keep FindKeyForward() short and likely to be
+ // inlined. When transitioning to a different block, we call
+ // FindBlockForward(), which is much longer and is probably not inlined.
+ FindBlockForward();
+ } else {
+ // This is the fast path that avoids a function call.
+ }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
+  // TODO: the while loop is inherited from two_level_iterator. We don't know
+  // whether a block can be empty, so we don't know whether it can be
+  // replaced by an "if".
+ do {
+ if (!block_iter_.status().ok()) {
+ return;
+ }
+ // Whether next data block is out of upper bound, if there is one.
+ const bool next_block_is_out_of_bound =
+ read_options_.iterate_upper_bound != nullptr &&
+ block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
+ assert(!next_block_is_out_of_bound ||
+ user_comparator_.Compare(*read_options_.iterate_upper_bound,
+ index_iter_->user_key()) <= 0);
+ ResetDataIter();
+ index_iter_->Next();
+ if (next_block_is_out_of_bound) {
+ // The next block is out of bound. No need to read it.
+ TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
+ // We need to make sure this is not the last data block before setting
+ // is_out_of_bound_, since the index key for the last data block can be
+ // larger than smallest key of the next file on the same level.
+ if (index_iter_->Valid()) {
+ is_out_of_bound_ = true;
+ }
+ return;
+ }
+
+ if (!index_iter_->Valid()) {
+ return;
+ }
+
+ IndexValue v = index_iter_->value();
+
+ // TODO(kolmike): Remove the != kBlockCacheTier condition.
+ if (!v.first_internal_key.empty() &&
+ read_options_.read_tier != kBlockCacheTier) {
+ // Index contains the first key of the block. Defer reading the block.
+ is_at_first_key_from_index_ = true;
+ return;
+ }
+
+ InitDataBlock();
+ block_iter_.SeekToFirst();
+ } while (!block_iter_.Valid());
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
+ while (!block_iter_.Valid()) {
+ if (!block_iter_.status().ok()) {
+ return;
+ }
+
+ ResetDataIter();
+ index_iter_->Prev();
+
+ if (index_iter_->Valid()) {
+ InitDataBlock();
+ block_iter_.SeekToLast();
+ } else {
+ return;
+ }
+ }
+
+  // We could have checked the lower bound here too, but we opt not to for
+  // code simplicity.
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
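+  // The iterator is "out of bound" once its user key is at or past
+  // read_options_.iterate_upper_bound; the bound is exclusive.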
+ if (read_options_.iterate_upper_bound != nullptr && Valid()) {
+ is_out_of_bound_ = user_comparator_.Compare(
+ *read_options_.iterate_upper_bound, user_key()) <= 0;
+ }
+}
+
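+// Sets data_block_within_upper_bound_: the index key is an upper bound for
+// all keys in the current data block, so the block lies entirely within
+// iterate_upper_bound iff the index key is strictly below the bound.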
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter,
+ TValue>::CheckDataBlockWithinUpperBound() {
+ if (read_options_.iterate_upper_bound != nullptr &&
+ block_iter_points_to_real_block_) {
+ data_block_within_upper_bound_ =
+ (user_comparator_.Compare(*read_options_.iterate_upper_bound,
+ index_iter_->user_key()) > 0);
+ }
+}
+
+InternalIterator* BlockBasedTable::NewIterator(
+ const ReadOptions& read_options, const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters, TableReaderCaller caller,
+ size_t compaction_readahead_size) {
+ BlockCacheLookupContext lookup_context{caller};
+ bool need_upper_bound_check =
+ read_options.auto_prefix_mode ||
+ PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
+ if (arena == nullptr) {
+ return new BlockBasedTableIterator<DataBlockIter>(
+ this, read_options, rep_->internal_comparator,
+ NewIndexIterator(
+ read_options,
+ need_upper_bound_check &&
+ rep_->index_type == BlockBasedTableOptions::kHashSearch,
+ /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
+ !skip_filters && !read_options.total_order_seek &&
+ prefix_extractor != nullptr,
+ need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
+ compaction_readahead_size);
+ } else {
+ auto* mem =
+ arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
+ return new (mem) BlockBasedTableIterator<DataBlockIter>(
+ this, read_options, rep_->internal_comparator,
+ NewIndexIterator(
+ read_options,
+ need_upper_bound_check &&
+ rep_->index_type == BlockBasedTableOptions::kHashSearch,
+ /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
+ !skip_filters && !read_options.total_order_seek &&
+ prefix_extractor != nullptr,
+ need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
+ compaction_readahead_size);
+ }
+}
+
+FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
+ const ReadOptions& read_options) {
+ if (rep_->fragmented_range_dels == nullptr) {
+ return nullptr;
+ }
+ SequenceNumber snapshot = kMaxSequenceNumber;
+ if (read_options.snapshot != nullptr) {
+ snapshot = read_options.snapshot->GetSequenceNumber();
+ }
+ return new FragmentedRangeTombstoneIterator(
+ rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
+}
+
+bool BlockBasedTable::FullFilterKeyMayMatch(
+ const ReadOptions& read_options, FilterBlockReader* filter,
+ const Slice& internal_key, const bool no_io,
+ const SliceTransform* prefix_extractor, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const {
+ if (filter == nullptr || filter->IsBlockBased()) {
+ return true;
+ }
+ Slice user_key = ExtractUserKey(internal_key);
+ const Slice* const const_ikey_ptr = &internal_key;
+ bool may_match = true;
+ if (rep_->whole_key_filtering) {
+ size_t ts_sz =
+ rep_->internal_comparator.user_comparator()->timestamp_size();
+ Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
+ may_match =
+ filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
+ no_io, const_ikey_ptr, get_context, lookup_context);
+ } else if (!read_options.total_order_seek && prefix_extractor &&
+ rep_->table_properties->prefix_extractor_name.compare(
+ prefix_extractor->Name()) == 0 &&
+ prefix_extractor->InDomain(user_key) &&
+ !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
+ prefix_extractor, kNotValid, no_io,
+ const_ikey_ptr, get_context,
+ lookup_context)) {
+ may_match = false;
+ }
+ if (may_match) {
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
+ }
+ return may_match;
+}
+
+void BlockBasedTable::FullFilterKeysMayMatch(
+ const ReadOptions& read_options, FilterBlockReader* filter,
+ MultiGetRange* range, const bool no_io,
+ const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const {
+ if (filter == nullptr || filter->IsBlockBased()) {
+ return;
+ }
+ if (rep_->whole_key_filtering) {
+ filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
+ lookup_context);
+ } else if (!read_options.total_order_seek && prefix_extractor &&
+ rep_->table_properties->prefix_extractor_name.compare(
+ prefix_extractor->Name()) == 0) {
+ filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
+ lookup_context);
+ }
+}
+
+Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor,
+ bool skip_filters) {
+  assert(key.size() >= 8);  // key must be an internal key (user key + 8-byte trailer)
+ assert(get_context != nullptr);
+ Status s;
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+
+ FilterBlockReader* const filter =
+ !skip_filters ? rep_->filter.get() : nullptr;
+
+  // First check the full filter. If the full filter is not useful, then
+  // go into each block.
+ uint64_t tracing_get_id = get_context->get_tracing_get_id();
+ BlockCacheLookupContext lookup_context{
+ TableReaderCaller::kUserGet, tracing_get_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+ // Trace the key since it contains both user key and sequence number.
+ lookup_context.referenced_key = key.ToString();
+ lookup_context.get_from_user_specified_snapshot =
+ read_options.snapshot != nullptr;
+ }
+ const bool may_match =
+ FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
+ get_context, &lookup_context);
+ if (!may_match) {
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+ } else {
+ IndexBlockIter iiter_on_stack;
+ // if prefix_extractor found in block differs from options, disable
+ // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+ bool need_upper_bound_check = false;
+ if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+ need_upper_bound_check = PrefixExtractorChanged(
+ rep_->table_properties.get(), prefix_extractor);
+ }
+ auto iiter =
+ NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+ get_context, &lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(iiter);
+ }
+
+ size_t ts_sz =
+ rep_->internal_comparator.user_comparator()->timestamp_size();
+    bool matched = false;  // if such user key matched a key in SST
+ bool done = false;
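+    // Probe candidate data blocks in index order, stopping as soon as the
+    // lookup is resolved (done) or a block-based filter rules the block out.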
+ for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+ IndexValue v = iiter->value();
+
+      bool not_exist_in_filter =
+          filter != nullptr && filter->IsBlockBased() &&
+ !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+ prefix_extractor, v.handle.offset(), no_io,
+ /*const_ikey_ptr=*/nullptr, get_context,
+ &lookup_context);
+
+ if (not_exist_in_filter) {
+ // Not found
+ // TODO: think about interaction with Merge. If a user key cannot
+ // cross one data block, we should be fine.
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+ break;
+ }
+
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+        // The requested key falls between the highest key in the previous
+        // block and the lowest key in the current block.
+ break;
+ }
+
+ BlockCacheLookupContext lookup_data_block_context{
+ TableReaderCaller::kUserGet, tracing_get_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr};
+ bool does_referenced_key_exist = false;
+ DataBlockIter biter;
+ uint64_t referenced_data_size = 0;
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, v.handle, &biter, BlockType::kData, get_context,
+ &lookup_data_block_context,
+ /*s=*/Status(), /*prefetch_buffer*/ nullptr);
+
+ if (no_io && biter.status().IsIncomplete()) {
+        // couldn't get the block from the block cache
+        // Update Saver.state to Found because we are only looking for
+        // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter.status().ok()) {
+ s = biter.status();
+ break;
+ }
+
+ bool may_exist = biter.SeekForGet(key);
+ // If user-specified timestamp is supported, we cannot end the search
+ // just because hash index lookup indicates the key+ts does not exist.
+ if (!may_exist && ts_sz == 0) {
+        // HashSeek cannot find the key in this block, and the iter is not
+        // at the end of the block, i.e. the key cannot be in the following
+        // blocks either. In this case, the seek_key cannot be found, so we
+        // break from the top level for-loop.
+ done = true;
+ } else {
+ // Call the *saver function on each entry/block until it returns false
+ for (; biter.Valid(); biter.Next()) {
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(biter.key(), &parsed_key)) {
+ s = Status::Corruption(Slice());
+ }
+
+ if (!get_context->SaveValue(
+ parsed_key, biter.value(), &matched,
+ biter.IsValuePinned() ? &biter : nullptr)) {
+ if (get_context->State() == GetContext::GetState::kFound) {
+ does_referenced_key_exist = true;
+ referenced_data_size = biter.key().size() + biter.value().size();
+ }
+ done = true;
+ break;
+ }
+ }
+ s = biter.status();
+ }
+ // Write the block cache access record.
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+ // Avoid making copy of block_key, cf_name, and referenced_key when
+ // constructing the access record.
+ Slice referenced_key;
+ if (does_referenced_key_exist) {
+ referenced_key = biter.key();
+ } else {
+ referenced_key = key;
+ }
+ BlockCacheTraceRecord access_record(
+ rep_->ioptions.env->NowMicros(),
+ /*block_key=*/"", lookup_data_block_context.block_type,
+ lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+ /*cf_name=*/"", rep_->level_for_tracing(),
+ rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+ lookup_data_block_context.is_cache_hit,
+ lookup_data_block_context.no_insert,
+ lookup_data_block_context.get_id,
+ lookup_data_block_context.get_from_user_specified_snapshot,
+ /*referenced_key=*/"", referenced_data_size,
+ lookup_data_block_context.num_keys_in_block,
+ does_referenced_key_exist);
+ block_cache_tracer_->WriteBlockAccess(
+ access_record, lookup_data_block_context.block_key,
+ rep_->cf_name_for_tracing(), referenced_key);
+ }
+
+ if (done) {
+ // Avoid the extra Next which is expensive in two-level indexes
+ break;
+ }
+ }
+ if (matched && filter != nullptr && !filter->IsBlockBased()) {
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
+ rep_->level);
+ }
+ if (s.ok() && !iiter->status().IsNotFound()) {
+ s = iiter->status();
+ }
+ }
+
+ return s;
+}
+
+using MultiGetRange = MultiGetContext::Range;
+void BlockBasedTable::MultiGet(const ReadOptions& read_options,
+ const MultiGetRange* mget_range,
+ const SliceTransform* prefix_extractor,
+ bool skip_filters) {
+ FilterBlockReader* const filter =
+ !skip_filters ? rep_->filter.get() : nullptr;
+ MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+
+  // First check the full filter. If the full filter is not useful, then
+  // go into each block.
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+ if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
+ tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
+ }
+ BlockCacheLookupContext lookup_context{
+ TableReaderCaller::kUserMultiGet, tracing_mget_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+ FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
+ prefix_extractor, &lookup_context);
+
+ if (skip_filters || !sst_file_range.empty()) {
+ IndexBlockIter iiter_on_stack;
+ // if prefix_extractor found in block differs from options, disable
+ // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+ bool need_upper_bound_check = false;
+ if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+ need_upper_bound_check = PrefixExtractorChanged(
+ rep_->table_properties.get(), prefix_extractor);
+ }
+ auto iiter =
+ NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+ sst_file_range.begin()->get_context, &lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(iiter);
+ }
+
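+    // `offset` is a sentinel (max uint64) until the first index lookup; it
+    // tracks the handle of the most recent data block so that consecutive
+    // keys falling in the same block are looked up only once.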
+ uint64_t offset = std::numeric_limits<uint64_t>::max();
+ autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
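+    // Scratch space for raw block reads: small batches fit in the stack
+    // buffer, larger ones spill into a heap allocation (see the total_len
+    // handling below).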
+ char stack_buf[kMultiGetReadStackBufSize];
+ std::unique_ptr<char[]> block_buf;
+ {
+ MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
+ sst_file_range.end());
+
+ CachableEntry<UncompressionDict> uncompression_dict;
+ Status uncompression_dict_status;
+ if (rep_->uncompression_dict_reader) {
+ uncompression_dict_status =
+ rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+ nullptr /* prefetch_buffer */, no_io,
+ sst_file_range.begin()->get_context, &lookup_context,
+ &uncompression_dict);
+ }
+
+ const UncompressionDict& dict = uncompression_dict.GetValue()
+ ? *uncompression_dict.GetValue()
+ : UncompressionDict::GetEmptyDict();
+
+ size_t total_len = 0;
+ ReadOptions ro = read_options;
+ ro.read_tier = kBlockCacheTier;
+
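+      // First pass: map each key to its data block via the index, and probe
+      // the block cache with read_tier forced to kBlockCacheTier so no I/O
+      // happens yet; cache misses are queued in block_handles for a single
+      // batched read afterwards.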
+ for (auto miter = data_block_range.begin();
+ miter != data_block_range.end(); ++miter) {
+ const Slice& key = miter->ikey;
+ iiter->Seek(miter->ikey);
+
+ IndexValue v;
+ if (iiter->Valid()) {
+ v = iiter->value();
+ }
+ if (!iiter->Valid() ||
+ (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0)) {
+          // The requested key falls between the highest key in the previous
+          // block and the lowest key in the current block.
+ *(miter->s) = iiter->status();
+ data_block_range.SkipKey(miter);
+ sst_file_range.SkipKey(miter);
+ continue;
+ }
+
+ if (!uncompression_dict_status.ok()) {
+ *(miter->s) = uncompression_dict_status;
+ data_block_range.SkipKey(miter);
+ sst_file_range.SkipKey(miter);
+ continue;
+ }
+
+ statuses.emplace_back();
+ results.emplace_back();
+ if (v.handle.offset() == offset) {
+          // We're going to reuse the block for this key later on. No need
+          // to look it up now; place a null handle.
+ block_handles.emplace_back(BlockHandle::NullBlockHandle());
+ continue;
+ }
+ // Lookup the cache for the given data block referenced by an index
+ // iterator value (i.e BlockHandle). If it exists in the cache,
+ // initialize block to the contents of the data block.
+ offset = v.handle.offset();
+ BlockHandle handle = v.handle;
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet);
+ Status s = RetrieveBlock(
+ nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
+ miter->get_context, &lookup_data_block_context,
+ /* for_compaction */ false, /* use_cache */ true);
+ if (s.IsIncomplete()) {
+ s = Status::OK();
+ }
+ if (s.ok() && !results.back().IsEmpty()) {
+          // Found it in the cache. Add a null handle to indicate there is
+          // nothing to read from disk.
+ block_handles.emplace_back(BlockHandle::NullBlockHandle());
+ } else {
+ block_handles.emplace_back(handle);
+ total_len += block_size(handle);
+ }
+ }
+
+ if (total_len) {
+ char* scratch = nullptr;
+        // If the blocks need to be uncompressed and we don't need the
+        // compressed blocks, then we can use a contiguous block of memory to
+        // read in all the blocks, as it will only be temporary storage:
+        // 1. If blocks are compressed and a compressed block cache exists,
+        //    alloc heap bufs.
+        // 2. If blocks are uncompressed, alloc heap bufs.
+        // 3. If blocks are compressed and there is no compressed block
+        //    cache, use the stack buf.
+ if (rep_->table_options.block_cache_compressed == nullptr &&
+ rep_->blocks_maybe_compressed) {
+ if (total_len <= kMultiGetReadStackBufSize) {
+ scratch = stack_buf;
+ } else {
+ scratch = new char[total_len];
+ block_buf.reset(scratch);
+ }
+ }
+ RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
+ &statuses, &results, scratch, dict);
+ }
+ }
+
+ DataBlockIter first_biter;
+ DataBlockIter next_biter;
+ size_t idx_in_batch = 0;
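+    // Second pass: resolve each key against the blocks gathered above,
+    // advancing to subsequent data blocks only when the key's entries may
+    // continue past the current block.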
+ for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
+ ++miter) {
+ Status s;
+ GetContext* get_context = miter->get_context;
+ const Slice& key = miter->ikey;
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ bool first_block = true;
+ do {
+ DataBlockIter* biter = nullptr;
+ bool reusing_block = true;
+ uint64_t referenced_data_size = 0;
+ bool does_referenced_key_exist = false;
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet, tracing_mget_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr);
+ if (first_block) {
+ if (!block_handles[idx_in_batch].IsNull() ||
+ !results[idx_in_batch].IsEmpty()) {
+ first_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, results[idx_in_batch], &first_biter,
+ statuses[idx_in_batch]);
+ reusing_block = false;
+ }
+ biter = &first_biter;
+ idx_in_batch++;
+ } else {
+ IndexValue v = iiter->value();
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+            // The requested key falls between the highest key in the
+            // previous block and the lowest key in the current block.
+ break;
+ }
+
+ next_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, iiter->value().handle, &next_biter,
+ BlockType::kData, get_context, &lookup_data_block_context,
+ Status(), nullptr);
+ biter = &next_biter;
+ reusing_block = false;
+ }
+
+ if (read_options.read_tier == kBlockCacheTier &&
+ biter->status().IsIncomplete()) {
+          // couldn't get the block from the block cache
+          // Update Saver.state to Found because we are only looking for
+          // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter->status().ok()) {
+ s = biter->status();
+ break;
+ }
+
+ bool may_exist = biter->SeekForGet(key);
+ if (!may_exist) {
+          // HashSeek cannot find the key in this block, and the iter is not
+          // at the end of the block, i.e. the key cannot be in the following
+          // blocks either. In this case, the seek_key cannot be found, so we
+          // break from the top level for-loop.
+ break;
+ }
+
+ // Call the *saver function on each entry/block until it returns false
+ for (; biter->Valid(); biter->Next()) {
+ ParsedInternalKey parsed_key;
+ Cleanable dummy;
+ Cleanable* value_pinner = nullptr;
+ if (!ParseInternalKey(biter->key(), &parsed_key)) {
+ s = Status::Corruption(Slice());
+ }
+ if (biter->IsValuePinned()) {
+ if (reusing_block) {
+ Cache* block_cache = rep_->table_options.block_cache.get();
+ assert(biter->cache_handle() != nullptr);
+ block_cache->Ref(biter->cache_handle());
+ dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
+ biter->cache_handle());
+ value_pinner = &dummy;
+ } else {
+ value_pinner = biter;
+ }
+ }
+ if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
+ value_pinner)) {
+ if (get_context->State() == GetContext::GetState::kFound) {
+ does_referenced_key_exist = true;
+ referenced_data_size =
+ biter->key().size() + biter->value().size();
+ }
+ done = true;
+ break;
+ }
+ s = biter->status();
+ }
+ // Write the block cache access.
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+ // Avoid making copy of block_key, cf_name, and referenced_key when
+ // constructing the access record.
+ Slice referenced_key;
+ if (does_referenced_key_exist) {
+ referenced_key = biter->key();
+ } else {
+ referenced_key = key;
+ }
+ BlockCacheTraceRecord access_record(
+ rep_->ioptions.env->NowMicros(),
+ /*block_key=*/"", lookup_data_block_context.block_type,
+ lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+ /*cf_name=*/"", rep_->level_for_tracing(),
+ rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+ lookup_data_block_context.is_cache_hit,
+ lookup_data_block_context.no_insert,
+ lookup_data_block_context.get_id,
+ lookup_data_block_context.get_from_user_specified_snapshot,
+ /*referenced_key=*/"", referenced_data_size,
+ lookup_data_block_context.num_keys_in_block,
+ does_referenced_key_exist);
+ block_cache_tracer_->WriteBlockAccess(
+ access_record, lookup_data_block_context.block_key,
+ rep_->cf_name_for_tracing(), referenced_key);
+ }
+ s = biter->status();
+ if (done) {
+ // Avoid the extra Next which is expensive in two-level indexes
+ break;
+ }
+ if (first_block) {
+ iiter->Seek(key);
+ }
+ first_block = false;
+ iiter->Next();
+ } while (iiter->Valid());
+
+ if (matched && filter != nullptr && !filter->IsBlockBased()) {
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
+ rep_->level);
+ }
+ if (s.ok()) {
+ s = iiter->status();
+ }
+ *(miter->s) = s;
+ }
+ }
+}
+
+Status BlockBasedTable::Prefetch(const Slice* const begin,
+ const Slice* const end) {
+ auto& comparator = rep_->internal_comparator;
+ UserComparatorWrapper user_comparator(comparator.user_comparator());
+ // pre-condition
+ if (begin && end && comparator.Compare(*begin, *end) > 0) {
+ return Status::InvalidArgument(*begin, *end);
+ }
+ BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+ IndexBlockIter iiter_on_stack;
+ auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+ &iiter_on_stack, /*get_context=*/nullptr,
+ &lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
+ }
+
+ if (!iiter->status().ok()) {
+ // error opening index iterator
+ return iiter->status();
+ }
+
+  // Indicates whether we are on the last page that needs to be pre-fetched.
+ bool prefetching_boundary_page = false;
+
+ for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
+ iiter->Next()) {
+ BlockHandle block_handle = iiter->value().handle;
+ const bool is_user_key = !rep_->index_key_includes_seq;
+ if (end &&
+ ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
+ (is_user_key &&
+ user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
+ if (prefetching_boundary_page) {
+ break;
+ }
+
+      // The index entry represents the last key in the data block.
+      // We should load this page into memory as well, but no more after it.
+ prefetching_boundary_page = true;
+ }
+
+ // Load the block specified by the block_handle into the block cache
+ DataBlockIter biter;
+
+ NewDataBlockIterator<DataBlockIter>(
+ ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
+ /*get_context=*/nullptr, &lookup_context, Status(),
+ /*prefetch_buffer=*/nullptr);
+
+ if (!biter.status().ok()) {
+ // there was an unexpected error while pre-fetching
+ return biter.status();
+ }
+ }
+
+ return Status::OK();
+}
+
+Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
+ TableReaderCaller caller) {
+ Status s;
+ // Check Meta blocks
+ std::unique_ptr<Block> metaindex;
+ std::unique_ptr<InternalIterator> metaindex_iter;
+ s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
+ &metaindex_iter);
+ if (s.ok()) {
+ s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return s;
+ }
+ // Check Data blocks
+ IndexBlockIter iiter_on_stack;
+ BlockCacheLookupContext context{caller};
+ InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
+ read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
+ /*get_context=*/nullptr, &context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
+ }
+ if (!iiter->status().ok()) {
+ // error opening index iterator
+ return iiter->status();
+ }
+ s = VerifyChecksumInBlocks(read_options, iiter);
+ return s;
+}
+
+Status BlockBasedTable::VerifyChecksumInBlocks(
+ const ReadOptions& read_options,
+ InternalIteratorBase<IndexValue>* index_iter) {
+ Status s;
+  // We are scanning the whole file, so there is no need to exponentially
+  // increase the buffer size.
+ size_t readahead_size = (read_options.readahead_size != 0)
+ ? read_options.readahead_size
+ : kMaxAutoReadaheadSize;
+ // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
+ // needed there.
+ FilePrefetchBuffer prefetch_buffer(
+      rep_->file.get(), readahead_size /* readahead_size */,
+ readahead_size /* max_readahead_size */,
+ !rep_->ioptions.allow_mmap_reads /* enable */);
+
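+  // Reading each block's raw contents is sufficient to verify its checksum;
+  // decompression is intentionally skipped.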
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle = index_iter->value().handle;
+ BlockContents contents;
+ BlockFetcher block_fetcher(
+ rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
+ &contents, rep_->ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kData,
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
+ const Slice& meta_block_name) {
+ if (meta_block_name.starts_with(kFilterBlockPrefix) ||
+ meta_block_name.starts_with(kFullFilterBlockPrefix) ||
+ meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
+ return BlockType::kFilter;
+ }
+
+ if (meta_block_name == kPropertiesBlock) {
+ return BlockType::kProperties;
+ }
+
+ if (meta_block_name == kCompressionDictBlock) {
+ return BlockType::kCompressionDictionary;
+ }
+
+ if (meta_block_name == kRangeDelBlock) {
+ return BlockType::kRangeDeletion;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesBlock) {
+ return BlockType::kHashIndexPrefixes;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
+ return BlockType::kHashIndexMetadata;
+ }
+
+ assert(false);
+ return BlockType::kInvalid;
+}
+
+Status BlockBasedTable::VerifyChecksumInMetaBlocks(
+ InternalIteratorBase<Slice>* index_iter) {
+ Status s;
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle;
+ Slice input = index_iter->value();
+ s = handle.DecodeFrom(&input);
+ BlockContents contents;
+ const Slice meta_block_name = index_iter->key();
+ BlockFetcher block_fetcher(
+ rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
+ ReadOptions(), handle, &contents, rep_->ioptions,
+ false /* decompress */, false /*maybe_compressed*/,
+ GetBlockTypeForMetaBlockByName(meta_block_name),
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
+ TableProperties* table_properties;
+ s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
+ index_iter->value(),
+ &table_properties);
+ delete table_properties;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
+ assert(rep_ != nullptr);
+
+ Cache* const cache = rep_->table_options.block_cache.get();
+ if (cache == nullptr) {
+ return false;
+ }
+
+ char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ Slice cache_key =
+ GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
+ cache_key_storage);
+
+ Cache::Handle* const cache_handle = cache->Lookup(cache_key);
+ if (cache_handle == nullptr) {
+ return false;
+ }
+
+ cache->Release(cache_handle);
+
+ return true;
+}
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+ const Slice& key) {
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
+ options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+ iiter->Seek(key);
+ assert(iiter->Valid());
+
+ return TEST_BlockInCache(iiter->value().handle);
+}
+
+// REQUIRES: The following fields of rep_ should have already been populated:
+//  1. file
+//  2. index_handle
+//  3. options
+//  4. internal_comparator
+//  5. index_type
+Status BlockBasedTable::CreateIndexReader(
+ FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
+ bool pin, BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader) {
+  // kHashSearch requires a non-empty prefix_extractor, but we bypass
+  // checking prefix_extractor here since we have no access to
+  // MutableCFOptions. The need_upper_bound_check flag in
+  // BlockBasedTable::NewIndexIterator covers this: if prefix_extractor does
+  // not match prefix_extractor_name from the table properties, Hash Index
+  // is turned off by setting total_order_seek to true.
+
+ switch (rep_->index_type) {
+ case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+ return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
+ prefetch, pin, lookup_context,
+ index_reader);
+ }
+ case BlockBasedTableOptions::kBinarySearch:
+ FALLTHROUGH_INTENDED;
+ case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+ return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
+ prefetch, pin, lookup_context,
+ index_reader);
+ }
+ case BlockBasedTableOptions::kHashSearch: {
+ std::unique_ptr<Block> metaindex_guard;
+ std::unique_ptr<InternalIterator> metaindex_iter_guard;
+ auto meta_index_iter = preloaded_meta_index_iter;
+ bool should_fallback = false;
+ if (rep_->internal_prefix_transform.get() == nullptr) {
+ ROCKS_LOG_WARN(rep_->ioptions.info_log,
+ "No prefix extractor passed in. Fall back to binary"
+ " search index.");
+ should_fallback = true;
+ } else if (meta_index_iter == nullptr) {
+ auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
+ &metaindex_iter_guard);
+ if (!s.ok()) {
+ // we simply fall back to binary search in case there is any
+ // problem with prefix hash index loading.
+ ROCKS_LOG_WARN(rep_->ioptions.info_log,
+ "Unable to read the metaindex block."
+ " Fall back to binary search index.");
+ should_fallback = true;
+ }
+ meta_index_iter = metaindex_iter_guard.get();
+ }
+
+ if (should_fallback) {
+ return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
+ prefetch, pin, lookup_context,
+ index_reader);
+ } else {
+ return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
+ use_cache, prefetch, pin, lookup_context,
+ index_reader);
+ }
+ }
+ default: {
+ std::string error_message =
+ "Unrecognized index type: " + ToString(rep_->index_type);
+ return Status::InvalidArgument(error_message.c_str());
+ }
+ }
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(
+ const InternalIteratorBase<IndexValue>& index_iter) const {
+ uint64_t result = 0;
+ if (index_iter.Valid()) {
+ BlockHandle handle = index_iter.value().handle;
+ result = handle.offset();
+ } else {
+    // The iterator is past the last key in the file. Approximate the offset
+    // with the data size from the table properties; if the properties are
+    // not available, fall back to the offset of the metaindex block (which
+    // is right near the end of the file).
+    if (rep_->table_properties) {
+      result = rep_->table_properties->data_size;
+    }
+    if (result == 0) {
+      // table_properties is not present in the table.
+ result = rep_->footer.metaindex_handle().offset();
+ }
+ }
+
+ return result;
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
+ TableReaderCaller caller) {
+ BlockCacheLookupContext context(caller);
+ IndexBlockIter iiter_on_stack;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ auto index_iter =
+ NewIndexIterator(ro, /*disable_prefix_seek=*/true,
+ /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+ /*lookup_context=*/&context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (index_iter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(index_iter);
+ }
+
+ index_iter->Seek(key);
+ return ApproximateOffsetOf(*index_iter);
+}
+
+uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) {
+ assert(rep_->internal_comparator.Compare(start, end) <= 0);
+
+ BlockCacheLookupContext context(caller);
+ IndexBlockIter iiter_on_stack;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ auto index_iter =
+ NewIndexIterator(ro, /*disable_prefix_seek=*/true,
+ /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+ /*lookup_context=*/&context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (index_iter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(index_iter);
+ }
+
+ index_iter->Seek(start);
+ uint64_t start_offset = ApproximateOffsetOf(*index_iter);
+ index_iter->Seek(end);
+ uint64_t end_offset = ApproximateOffsetOf(*index_iter);
+
+ assert(end_offset >= start_offset);
+ return end_offset - start_offset;
+}
+
+bool BlockBasedTable::TEST_FilterBlockInCache() const {
+ assert(rep_ != nullptr);
+ return TEST_BlockInCache(rep_->filter_handle);
+}
+
+bool BlockBasedTable::TEST_IndexBlockInCache() const {
+ assert(rep_ != nullptr);
+
+ return TEST_BlockInCache(rep_->footer.index_handle());
+}
+
+Status BlockBasedTable::GetKVPairsFromDataBlocks(
+ std::vector<KVPairBlock>* kv_pair_blocks) {
+ std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+ NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+ /*input_iter=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_contex=*/nullptr));
+
+ Status s = blockhandles_iter->status();
+ if (!s.ok()) {
+ // Cannot read Index Block
+ return s;
+ }
+
+ for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+ blockhandles_iter->Next()) {
+ s = blockhandles_iter->status();
+
+ if (!s.ok()) {
+ break;
+ }
+
+ std::unique_ptr<InternalIterator> datablock_iter;
+ datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
+ ReadOptions(), blockhandles_iter->value().handle,
+ /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+ /*prefetch_buffer=*/nullptr));
+ s = datablock_iter->status();
+
+ if (!s.ok()) {
+ // Error reading the block - Skipped
+ continue;
+ }
+
+ KVPairBlock kv_pair_block;
+ for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
+ datablock_iter->Next()) {
+ s = datablock_iter->status();
+ if (!s.ok()) {
+ // Error reading the block - Skipped
+ break;
+ }
+ const Slice& key = datablock_iter->key();
+ const Slice& value = datablock_iter->value();
+ std::string key_copy = std::string(key.data(), key.size());
+ std::string value_copy = std::string(value.data(), value.size());
+
+ kv_pair_block.push_back(
+ std::make_pair(std::move(key_copy), std::move(value_copy)));
+ }
+ kv_pair_blocks->push_back(std::move(kv_pair_block));
+ }
+ return Status::OK();
+}
+
+Status BlockBasedTable::DumpTable(WritableFile* out_file) {
+ // Output Footer
+ out_file->Append(
+ "Footer Details:\n"
+ "--------------------------------------\n"
+ " ");
+ out_file->Append(rep_->footer.ToString().c_str());
+ out_file->Append("\n");
+
+ // Output MetaIndex
+ out_file->Append(
+ "Metaindex Details:\n"
+ "--------------------------------------\n");
+ std::unique_ptr<Block> metaindex;
+ std::unique_ptr<InternalIterator> metaindex_iter;
+ Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
+ &metaindex_iter);
+ if (s.ok()) {
+ for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+ metaindex_iter->Next()) {
+ s = metaindex_iter->status();
+ if (!s.ok()) {
+ return s;
+ }
+ if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
+ out_file->Append(" Properties block handle: ");
+ out_file->Append(metaindex_iter->value().ToString(true).c_str());
+ out_file->Append("\n");
+ } else if (metaindex_iter->key() ==
+ ROCKSDB_NAMESPACE::kCompressionDictBlock) {
+ out_file->Append(" Compression dictionary block handle: ");
+ out_file->Append(metaindex_iter->value().ToString(true).c_str());
+ out_file->Append("\n");
+ } else if (strstr(metaindex_iter->key().ToString().c_str(),
+ "filter.rocksdb.") != nullptr) {
+ out_file->Append(" Filter block handle: ");
+ out_file->Append(metaindex_iter->value().ToString(true).c_str());
+ out_file->Append("\n");
+ } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
+ out_file->Append(" Range deletion block handle: ");
+ out_file->Append(metaindex_iter->value().ToString(true).c_str());
+ out_file->Append("\n");
+ }
+ }
+ out_file->Append("\n");
+ } else {
+ return s;
+ }
+
+ // Output TableProperties
+ const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+ table_properties = rep_->table_properties.get();
+
+ if (table_properties != nullptr) {
+ out_file->Append(
+ "Table Properties:\n"
+ "--------------------------------------\n"
+ " ");
+ out_file->Append(table_properties->ToString("\n ", ": ").c_str());
+ out_file->Append("\n");
+ }
+
+ if (rep_->filter) {
+ out_file->Append(
+ "Filter Details:\n"
+ "--------------------------------------\n"
+ " ");
+ out_file->Append(rep_->filter->ToString().c_str());
+ out_file->Append("\n");
+ }
+
+ // Output Index block
+ s = DumpIndexBlock(out_file);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Output compression dictionary
+ if (rep_->uncompression_dict_reader) {
+ CachableEntry<UncompressionDict> uncompression_dict;
+ s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+ nullptr /* prefetch_buffer */, false /* no_io */,
+ nullptr /* get_context */, nullptr /* lookup_context */,
+ &uncompression_dict);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(uncompression_dict.GetValue());
+
+ const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
+ out_file->Append(
+ "Compression Dictionary:\n"
+ "--------------------------------------\n");
+ out_file->Append(" size (bytes): ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
+ out_file->Append("\n\n");
+ out_file->Append(" HEX ");
+ out_file->Append(raw_dict.ToString(true).c_str());
+ out_file->Append("\n\n");
+ }
+
+ // Output range deletions block
+ auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
+ if (range_del_iter != nullptr) {
+ range_del_iter->SeekToFirst();
+ if (range_del_iter->Valid()) {
+ out_file->Append(
+ "Range deletions:\n"
+ "--------------------------------------\n"
+ " ");
+ for (; range_del_iter->Valid(); range_del_iter->Next()) {
+ DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
+ }
+ out_file->Append("\n");
+ }
+ delete range_del_iter;
+ }
+ // Output Data blocks
+ s = DumpDataBlocks(out_file);
+
+ return s;
+}
+
+Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
+ out_file->Append(
+ "Index Details:\n"
+ "--------------------------------------\n");
+ std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+ NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+ /*input_iter=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_contex=*/nullptr));
+ Status s = blockhandles_iter->status();
+ if (!s.ok()) {
+ out_file->Append("Can not read Index Block \n\n");
+ return s;
+ }
+
+ out_file->Append(" Block key hex dump: Data block handle\n");
+ out_file->Append(" Block key ascii\n\n");
+ for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+ blockhandles_iter->Next()) {
+ s = blockhandles_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ Slice key = blockhandles_iter->key();
+ Slice user_key;
+ InternalKey ikey;
+ if (!rep_->index_key_includes_seq) {
+ user_key = key;
+ } else {
+ ikey.DecodeFrom(key);
+ user_key = ikey.user_key();
+ }
+
+ out_file->Append(" HEX ");
+ out_file->Append(user_key.ToString(true).c_str());
+ out_file->Append(": ");
+ out_file->Append(blockhandles_iter->value()
+ .ToString(true, rep_->index_has_first_key)
+ .c_str());
+ out_file->Append("\n");
+
+ std::string str_key = user_key.ToString();
+ std::string res_key("");
+ char cspace = ' ';
+ for (size_t i = 0; i < str_key.size(); i++) {
+ res_key.append(&str_key[i], 1);
+ res_key.append(1, cspace);
+ }
+ out_file->Append(" ASCII ");
+ out_file->Append(res_key.c_str());
+ out_file->Append("\n ------\n");
+ }
+ out_file->Append("\n");
+ return Status::OK();
+}
+
+Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
+ std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+ NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+ /*input_iter=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_contex=*/nullptr));
+ Status s = blockhandles_iter->status();
+ if (!s.ok()) {
+ out_file->Append("Can not read Index Block \n\n");
+ return s;
+ }
+
+ uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
+ uint64_t datablock_size_max = 0;
+ uint64_t datablock_size_sum = 0;
+
+ size_t block_id = 1;
+ for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+ block_id++, blockhandles_iter->Next()) {
+ s = blockhandles_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+
+ BlockHandle bh = blockhandles_iter->value().handle;
+ uint64_t datablock_size = bh.size();
+ datablock_size_min = std::min(datablock_size_min, datablock_size);
+ datablock_size_max = std::max(datablock_size_max, datablock_size);
+ datablock_size_sum += datablock_size;
+
+ out_file->Append("Data Block # ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
+ out_file->Append(" @ ");
+ out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
+ out_file->Append("\n");
+ out_file->Append("--------------------------------------\n");
+
+ std::unique_ptr<InternalIterator> datablock_iter;
+ datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
+ ReadOptions(), blockhandles_iter->value().handle,
+ /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
+ /*prefetch_buffer=*/nullptr));
+ s = datablock_iter->status();
+
+ if (!s.ok()) {
+ out_file->Append("Error reading the block - Skipped \n\n");
+ continue;
+ }
+
+ for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
+ datablock_iter->Next()) {
+ s = datablock_iter->status();
+ if (!s.ok()) {
+ out_file->Append("Error reading the block - Skipped \n");
+ break;
+ }
+ DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
+ }
+ out_file->Append("\n");
+ }
+
+ uint64_t num_datablocks = block_id - 1;
+ if (num_datablocks) {
+ double datablock_size_avg =
+ static_cast<double>(datablock_size_sum) / num_datablocks;
+ out_file->Append("Data Block Summary:\n");
+ out_file->Append("--------------------------------------");
+ out_file->Append("\n # data blocks: ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
+ out_file->Append("\n min data block size: ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
+ out_file->Append("\n max data block size: ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
+ out_file->Append("\n avg data block size: ");
+ out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
+ out_file->Append("\n");
+ }
+
+ return Status::OK();
+}
+
+void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
+ WritableFile* out_file) {
+ InternalKey ikey;
+ ikey.DecodeFrom(key);
+
+ out_file->Append(" HEX ");
+ out_file->Append(ikey.user_key().ToString(true).c_str());
+ out_file->Append(": ");
+ out_file->Append(value.ToString(true).c_str());
+ out_file->Append("\n");
+
+ std::string str_key = ikey.user_key().ToString();
+ std::string str_value = value.ToString();
+ std::string res_key(""), res_value("");
+ char cspace = ' ';
+ for (size_t i = 0; i < str_key.size(); i++) {
+ if (str_key[i] == '\0') {
+ res_key.append("\\0", 2);
+ } else {
+ res_key.append(&str_key[i], 1);
+ }
+ res_key.append(1, cspace);
+ }
+ for (size_t i = 0; i < str_value.size(); i++) {
+ if (str_value[i] == '\0') {
+ res_value.append("\\0", 2);
+ } else {
+ res_value.append(&str_value[i], 1);
+ }
+ res_value.append(1, cspace);
+ }
+
+ out_file->Append(" ASCII ");
+ out_file->Append(res_key.c_str());
+ out_file->Append(": ");
+ out_file->Append(res_value.c_str());
+ out_file->Append("\n ------\n");
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader.h b/src/rocksdb/table/block_based/block_based_table_reader.h
new file mode 100644
index 000000000..28a378988
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader.h
@@ -0,0 +1,824 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/uncompression_dict_reader.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "table/persistent_cache_helper.h"
+#include "table/table_properties_internal.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class FilterBlockReader;
+class BlockBasedFilterBlockReader;
+class FullFilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class FSRandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct ReadOptions;
+class GetContext;
+
+typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
+
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed size blocks and
+// each block in turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. As for record retrieval, we first
+// locate the block where the target record may reside, then read the block
+// into memory, and finally search for that record within the block. To avoid
+// frequent reads of the same block, we introduced the block cache to keep
+// loaded blocks in memory.
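+//
+// An illustrative usage sketch (not part of the original sources; option
+// setup and error handling are omitted, and `ioptions`, `file_reader`, etc.
+// are placeholder names):
+//
+//   std::unique_ptr<TableReader> table_reader;
+//   Status s = BlockBasedTable::Open(ioptions, env_options, table_options,
+//                                    internal_key_comparator,
+//                                    std::move(file_reader), file_size,
+//                                    &table_reader);
+//   if (s.ok()) {
+//     std::unique_ptr<InternalIterator> it(table_reader->NewIterator(
+//         ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+//         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+//     for (it->SeekToFirst(); it->Valid(); it->Next()) { /* ... */ }
+//   }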
+class BlockBasedTable : public TableReader {
+ public:
+ static const std::string kFilterBlockPrefix;
+ static const std::string kFullFilterBlockPrefix;
+ static const std::string kPartitionedFilterBlockPrefix;
+ // The longest prefix of the cache key used to identify blocks.
+ // For Posix files the unique ID is three varints.
+ static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
+
+ // All the below fields control iterator readahead
+ static const size_t kInitAutoReadaheadSize = 8 * 1024;
+ // Based on experiments, a 256 KB readahead size provides the best
+ // performance for auto readahead. The experiment data is in PR #3282.
+ static const size_t kMaxAutoReadaheadSize;
+ static const int kMinNumFileReadsToStartAutoReadahead = 2;
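+ // For example (a sketch of the intended ramp-up, not a strict guarantee):
+ // once a file has been read kMinNumFileReadsToStartAutoReadahead times
+ // through an iterator, readahead starts at kInitAutoReadaheadSize (8 KB)
+ // and grows with each subsequent file read until it is capped at
+ // kMaxAutoReadaheadSize.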
+
+ // Attempt to open the table that is stored in bytes [0..file_size)
+ // of "file", and read the metadata entries necessary to allow
+ // retrieving data from the table.
+ //
+ // If successful, returns ok and sets "*table_reader" to the newly opened
+ // table. The client should delete "*table_reader" when no longer needed.
+ // If there was an error while initializing the table, sets "*table_reader"
+ // to nullptr and returns a non-ok status.
+ //
+ // @param file must remain live while this Table is in use.
+ // @param prefetch_index_and_filter_in_cache can be used to disable
+ // prefetching of index and filter blocks into block cache at startup
+ // @param skip_filters Disables loading/accessing the filter block. Overrides
+ // prefetch_index_and_filter_in_cache, so filter will be skipped if both
+ // are set.
+ static Status Open(const ImmutableCFOptions& ioptions,
+ const EnvOptions& env_options,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_key_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ const SliceTransform* prefix_extractor = nullptr,
+ bool prefetch_index_and_filter_in_cache = true,
+ bool skip_filters = false, int level = -1,
+ const bool immortal_table = false,
+ const SequenceNumber largest_seqno = 0,
+ TailPrefetchStats* tail_prefetch_stats = nullptr,
+ BlockCacheTracer* const block_cache_tracer = nullptr);
+
+ bool PrefixMayMatch(const Slice& internal_key,
+ const ReadOptions& read_options,
+ const SliceTransform* options_prefix_extractor,
+ const bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) const;
+
+ // Returns a new iterator over the table contents.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ // @param skip_filters Disables loading/accessing the filter block
+ // compaction_readahead_size: its value will only be used if caller =
+ // kCompaction.
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
+
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options) override;
+
+ // @param skip_filters Disables loading/accessing the filter block
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ void MultiGet(const ReadOptions& readOptions,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ // Pre-fetch the disk blocks that correspond to the key range specified by
+ // (kbegin, kend). The call will return error status in the event of
+ // IO or iteration error.
+ Status Prefetch(const Slice* begin, const Slice* end) override;
+
+ // Given a key, return an approximate byte offset in the file where
+ // the data for that key begins (or would begin if the key were
+ // present in the file). The returned value is in terms of file
+ // bytes, and so includes effects like compression of the underlying data.
+ // E.g., the approximate offset of the last key in the table will
+ // be close to the file length.
+ uint64_t ApproximateOffsetOf(const Slice& key,
+ TableReaderCaller caller) override;
+
+ // Given start and end keys, return the approximate data size in the file
+ // between the keys. The returned value is in terms of file bytes, and so
+ // includes effects like compression of the underlying data.
+ // The start key must not be greater than the end key.
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) override;
+
+ bool TEST_BlockInCache(const BlockHandle& handle) const;
+
+ // Returns true if the block for the specified key is in cache.
+ // REQUIRES: key is in this table && block cache enabled
+ bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+ // Set up the table for Compaction. Might change some parameters with
+ // posix_fadvise
+ void SetupForCompaction() override;
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+ size_t ApproximateMemoryUsage() const override;
+
+ // convert SST file to a human readable form
+ Status DumpTable(WritableFile* out_file) override;
+
+ Status VerifyChecksum(const ReadOptions& readOptions,
+ TableReaderCaller caller) override;
+
+ ~BlockBasedTable();
+
+ bool TEST_FilterBlockInCache() const;
+ bool TEST_IndexBlockInCache() const;
+
+ // IndexReader is the interface that provides the functionality for index
+ // access.
+ class IndexReader {
+ public:
+ virtual ~IndexReader() = default;
+
+ // Create an iterator for index access. If iter is null, then a new object
+ // is created on the heap, and the caller will have ownership of it.
+ // If a non-null iter is passed in, it will be used, and the returned value
+ // is either the same as iter or a new on-heap object that
+ // wraps the passed iter. In the latter case the return value points
+ // to a different object than iter, and the caller has ownership of the
+ // returned object.
+ virtual InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) = 0;
+
+ // Report an approximation of how much memory has been used other than
+ // memory that was allocated in block cache.
+ virtual size_t ApproximateMemoryUsage() const = 0;
+ // Cache the dependencies of the index reader (e.g. the partitions
+ // of a partitioned index).
+ virtual void CacheDependencies(bool /* pin */) {}
+ };
+
+ class IndexReaderCommon;
+
+ static Slice GetCacheKey(const char* cache_key_prefix,
+ size_t cache_key_prefix_size,
+ const BlockHandle& handle, char* cache_key);
+
+ // Retrieve all key value pairs from data blocks in the table.
+ // The keys retrieved are internal keys.
+ Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
+
+ struct Rep;
+
+ Rep* get_rep() { return rep_; }
+ const Rep* get_rep() const { return rep_; }
+
+ // input_iter: if it is not null, update this one and return it as Iterator
+ template <typename TBlockIter>
+ TBlockIter* NewDataBlockIterator(
+ const ReadOptions& ro, const BlockHandle& block_handle,
+ TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context, Status s,
+ FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;
+
+ // input_iter: if it is not null, update this one and return it as Iterator
+ template <typename TBlockIter>
+ TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
+ CachableEntry<Block>& block,
+ TBlockIter* input_iter, Status s) const;
+
+ class PartitionedIndexIteratorState;
+
+ template <typename TBlocklike>
+ friend class FilterBlockReaderCommon;
+
+ friend class PartitionIndexReader;
+
+ friend class UncompressionDictReader;
+
+ protected:
+ Rep* rep_;
+ explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer)
+ : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
+ // No copying allowed
+ explicit BlockBasedTable(const TableReader&) = delete;
+ void operator=(const TableReader&) = delete;
+
+ private:
+ friend class MockedBlockBasedTable;
+ static std::atomic<uint64_t> next_cache_key_id_;
+ BlockCacheTracer* const block_cache_tracer_;
+
+ void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
+ size_t usage) const;
+ void UpdateCacheMissMetrics(BlockType block_type,
+ GetContext* get_context) const;
+ void UpdateCacheInsertionMetrics(BlockType block_type,
+ GetContext* get_context, size_t usage) const;
+ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
+ BlockType block_type,
+ GetContext* get_context) const;
+
+ // Either Block::NewDataIterator() or Block::NewIndexIterator().
+ template <typename TBlockIter>
+ static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
+ TBlockIter* input_iter,
+ bool block_contents_pinned);
+
+ // If block cache enabled (compressed or uncompressed), looks for the block
+ // identified by handle in (1) uncompressed cache, (2) compressed cache, and
+ // then (3) file. If found, inserts into the cache(s) that were searched
+ // unsuccessfully (e.g., if found in file, will add to both uncompressed and
+ // compressed caches if they're enabled).
+ //
+ // @param block_entry value is set to the uncompressed block if found. If
+ // in uncompressed block cache, also sets cache_handle to reference that
+ // block.
+ template <typename TBlocklike>
+ Status MaybeReadBlockAndLoadToCache(
+ FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+ const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ BlockContents* contents) const;
+
+ // Similar to the above, with one crucial difference: it will retrieve the
+ // block from the file even if there are no caches configured (assuming the
+ // read options allow I/O).
+ template <typename TBlocklike>
+ Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& ro, const BlockHandle& handle,
+ const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry,
+ BlockType block_type, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+ void RetrieveMultipleBlocks(
+ const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
+ results,
+ char* scratch, const UncompressionDict& uncompression_dict) const;
+
+ // Get the iterator from the index reader.
+ //
+ // If input_iter is not set, return a new Iterator.
+ // If input_iter is set, try to update it and return it as Iterator.
+ // However note that in some cases the returned iterator may be different
+ // from input_iter. In that case, the returned iterator should be freed.
+ //
+ // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+ // following conditions are met:
+ // 1. We enabled table_options.cache_index_and_filter_blocks.
+ // 2. index is not present in block cache.
+ // 3. We disallowed any io to be performed, that is, read_options ==
+ // kBlockCacheTier
+ InternalIteratorBase<IndexValue>* NewIndexIterator(
+ const ReadOptions& read_options, bool need_upper_bound_check,
+ IndexBlockIter* input_iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ // Look up the block in the block caches (if set): block_cache and
+ // block_cache_compressed.
+ // On success, Status::OK will be returned and @block will be populated with
+ // pointer to the block as well as its block handle.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status GetDataBlockFromCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
+ const UncompressionDict& uncompression_dict, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Put a raw block (maybe compressed) into the corresponding block caches.
+ // This method will perform decompression against raw_block if needed and then
+ // populate the block caches.
+ // On success, Status::OK will be returned; also @block will be populated with
+ // the uncompressed block and its cache handle.
+ //
+ // Allocated memory managed by raw_block_contents will be transferred to
+ // PutDataBlockToCache(). After the call, the object will be invalid.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status PutDataBlockToCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ CachableEntry<TBlocklike>* cached_block,
+ BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
+ const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
+ MemoryAllocator* memory_allocator, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+ // after a call to Seek(key), until handle_result returns false.
+ // May not make such a call if filter policy says that key is not present.
+ friend class TableCache;
+ friend class BlockBasedTableBuilder;
+
+ // Create an index reader based on the index type stored in the table.
+ // Optionally, the user can pass a preloaded meta_index_iter for index types
+ // that need to access extra meta blocks during index construction. This
+ // parameter helps avoid re-reading the meta index block if the caller has
+ // already created one.
+ Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* preloaded_meta_index_iter,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader);
+
+ bool FullFilterKeyMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, const Slice& user_key,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ void FullFilterKeysMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, MultiGetRange* range,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const;
+
+ static Status PrefetchTail(
+ RandomAccessFileReader* file, uint64_t file_size,
+ TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
+ const bool preload_all,
+ std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
+ Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
+ std::unique_ptr<Block>* metaindex_block,
+ std::unique_ptr<InternalIterator>* iter);
+ Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
+ const Slice& handle_value,
+ TableProperties** table_properties);
+ Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const SequenceNumber largest_seqno);
+ Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const InternalKeyComparator& internal_comparator,
+ BlockCacheLookupContext* lookup_context);
+ Status PrefetchIndexAndFilterBlocks(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ BlockBasedTable* new_table, bool prefetch_all,
+ const BlockBasedTableOptions& table_options, const int level,
+ BlockCacheLookupContext* lookup_context);
+
+ static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
+
+ Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
+ Status VerifyChecksumInBlocks(const ReadOptions& read_options,
+ InternalIteratorBase<IndexValue>* index_iter);
+
+ // Create the filter from the filter block.
+ std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+ bool pin, BlockCacheLookupContext* lookup_context);
+
+ static void SetupCacheKeyPrefix(Rep* rep);
+
+ // Generate a cache key prefix from the file
+ static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
+ char* buffer, size_t* size);
+ static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
+ size_t* size);
+
+ // Given an index iterator, return its offset in the file.
+ uint64_t ApproximateOffsetOf(
+ const InternalIteratorBase<IndexValue>& index_iter) const;
+
+ // Helper functions for DumpTable()
+ Status DumpIndexBlock(WritableFile* out_file);
+ Status DumpDataBlocks(WritableFile* out_file);
+ void DumpKeyValue(const Slice& key, const Slice& value,
+ WritableFile* out_file);
+
+ // A cumulative data block file read in MultiGet smaller than this size will
+ // use a stack buffer
+ static constexpr size_t kMultiGetReadStackBufSize = 8192;
+
+ friend class PartitionedFilterBlockReader;
+ friend class PartitionedFilterBlockTest;
+ friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+};
+
+// Maintains the state of a two-level iteration over a partitioned index
+// structure.
+class BlockBasedTable::PartitionedIndexIteratorState
+ : public TwoLevelIteratorState {
+ public:
+ PartitionedIndexIteratorState(
+ const BlockBasedTable* table,
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
+ InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+ const BlockHandle& index_value) override;
+
+ private:
+ // Don't own table_
+ const BlockBasedTable* table_;
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
+};
+
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
+struct BlockBasedTable::Rep {
+ Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
+ const BlockBasedTableOptions& _table_opt,
+ const InternalKeyComparator& _internal_comparator, bool skip_filters,
+ int _level, const bool _immortal_table)
+ : ioptions(_ioptions),
+ env_options(_env_options),
+ table_options(_table_opt),
+ filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
+ internal_comparator(_internal_comparator),
+ filter_type(FilterType::kNoFilter),
+ index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
+ hash_index_allow_collision(false),
+ whole_key_filtering(_table_opt.whole_key_filtering),
+ prefix_filtering(true),
+ global_seqno(kDisableGlobalSequenceNumber),
+ level(_level),
+ immortal_table(_immortal_table) {}
+
+ const ImmutableCFOptions& ioptions;
+ const EnvOptions& env_options;
+ const BlockBasedTableOptions table_options;
+ const FilterPolicy* const filter_policy;
+ const InternalKeyComparator& internal_comparator;
+ Status status;
+ std::unique_ptr<RandomAccessFileReader> file;
+ char cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t cache_key_prefix_size = 0;
+ char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t persistent_cache_key_prefix_size = 0;
+ char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t compressed_cache_key_prefix_size = 0;
+ PersistentCacheOptions persistent_cache_options;
+
+ // Footer contains the fixed table information
+ Footer footer;
+
+ std::unique_ptr<IndexReader> index_reader;
+ std::unique_ptr<FilterBlockReader> filter;
+ std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+
+ enum class FilterType {
+ kNoFilter,
+ kFullFilter,
+ kBlockFilter,
+ kPartitionedFilter,
+ };
+ FilterType filter_type;
+ BlockHandle filter_handle;
+ BlockHandle compression_dict_handle;
+
+ std::shared_ptr<const TableProperties> table_properties;
+ BlockBasedTableOptions::IndexType index_type;
+ bool hash_index_allow_collision;
+ bool whole_key_filtering;
+ bool prefix_filtering;
+ // TODO(kailiu) It is very ugly to use internal key in table, since table
+ // module should not be relying on db module. However to make things easier
+ // and compatible with existing code, we introduce a wrapper that allows
+ // block to extract prefix without knowing if a key is internal or not.
+ // null if no prefix_extractor is passed in when opening the table reader.
+ std::unique_ptr<SliceTransform> internal_prefix_transform;
+ std::shared_ptr<const SliceTransform> table_prefix_extractor;
+
+ std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
+
+ // If global_seqno is used, all keys in this file will have the same
+ // seqno, with value `global_seqno`.
+ //
+ // A value of kDisableGlobalSequenceNumber means that this feature is disabled
+ // and every key has its own seqno.
+ SequenceNumber global_seqno;
+
+ // The level at which the table is opened; it could potentially change when a
+ // trivial move is involved.
+ int level;
+
+ // If false, blocks in this file are definitely all uncompressed. Knowing this
+ // before reading individual blocks enables certain optimizations.
+ bool blocks_maybe_compressed = true;
+
+ // If true, data blocks in this file are definitely ZSTD compressed. If false
+ // they might not be. When false we skip creating a ZSTD digested
+ // uncompression dictionary. Even if we get a false negative, things should
+ // still work, just not as quickly.
+ bool blocks_definitely_zstd_compressed = false;
+
+ // These describe how index is encoded.
+ bool index_has_first_key = false;
+ bool index_key_includes_seq = true;
+ bool index_value_is_full = true;
+
+ const bool immortal_table;
+
+ SequenceNumber get_global_seqno(BlockType block_type) const {
+ return (block_type == BlockType::kFilter ||
+ block_type == BlockType::kCompressionDictionary)
+ ? kDisableGlobalSequenceNumber
+ : global_seqno;
+ }
+
+ uint64_t cf_id_for_tracing() const {
+ return table_properties
+ ? table_properties->column_family_id
+ : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
+ kUnknownColumnFamily;
+ }
+
+ Slice cf_name_for_tracing() const {
+ return table_properties ? table_properties->column_family_name
+ : BlockCacheTraceHelper::kUnknownColumnFamilyName;
+ }
+
+ uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }
+
+ uint64_t sst_number_for_tracing() const {
+ return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
+ }
+ void CreateFilePrefetchBuffer(
+ size_t readahead_size, size_t max_readahead_size,
+ std::unique_ptr<FilePrefetchBuffer>* fpb) const {
+ fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size,
+ max_readahead_size,
+ !ioptions.allow_mmap_reads /* enable */));
+ }
+};
+
+// Iterates over the contents of BlockBasedTable.
+template <class TBlockIter, typename TValue = Slice>
+class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
+ // compaction_readahead_size: its value will only be used if for_compaction =
+ // true
+ public:
+ BlockBasedTableIterator(const BlockBasedTable* table,
+ const ReadOptions& read_options,
+ const InternalKeyComparator& icomp,
+ InternalIteratorBase<IndexValue>* index_iter,
+ bool check_filter, bool need_upper_bound_check,
+ const SliceTransform* prefix_extractor,
+ BlockType block_type, TableReaderCaller caller,
+ size_t compaction_readahead_size = 0)
+ : table_(table),
+ read_options_(read_options),
+ icomp_(icomp),
+ user_comparator_(icomp.user_comparator()),
+ index_iter_(index_iter),
+ pinned_iters_mgr_(nullptr),
+ block_iter_points_to_real_block_(false),
+ check_filter_(check_filter),
+ need_upper_bound_check_(need_upper_bound_check),
+ prefix_extractor_(prefix_extractor),
+ block_type_(block_type),
+ lookup_context_(caller),
+ compaction_readahead_size_(compaction_readahead_size) {}
+
+ ~BlockBasedTableIterator() { delete index_iter_; }
+
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() final override;
+ bool NextAndGetResult(IterateResult* result) override;
+ void Prev() override;
+ bool Valid() const override {
+ return !is_out_of_bound_ &&
+ (is_at_first_key_from_index_ ||
+ (block_iter_points_to_real_block_ && block_iter_.Valid()));
+ }
+ Slice key() const override {
+ assert(Valid());
+ if (is_at_first_key_from_index_) {
+ return index_iter_->value().first_internal_key;
+ } else {
+ return block_iter_.key();
+ }
+ }
+ Slice user_key() const override {
+ assert(Valid());
+ if (is_at_first_key_from_index_) {
+ return ExtractUserKey(index_iter_->value().first_internal_key);
+ } else {
+ return block_iter_.user_key();
+ }
+ }
+ TValue value() const override {
+ assert(Valid());
+
+ // Load current block if not loaded.
+ if (is_at_first_key_from_index_ &&
+ !const_cast<BlockBasedTableIterator*>(this)
+ ->MaterializeCurrentBlock()) {
+ // Oops, index is not consistent with block contents, but we have
+ // no good way to report error at this point. Let's return empty value.
+ return TValue();
+ }
+
+ return block_iter_.value();
+ }
+ Status status() const override {
+ // The prefix index sets status to NotFound when the prefix does not exist
+ if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
+ return index_iter_->status();
+ } else if (block_iter_points_to_real_block_) {
+ return block_iter_.status();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ // Whether the iterator has been invalidated for being out of bound.
+ bool IsOutOfBound() override { return is_out_of_bound_; }
+
+ inline bool MayBeOutOfUpperBound() override {
+ assert(Valid());
+ return !data_block_within_upper_bound_;
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ bool IsKeyPinned() const override {
+ // Our key comes either from block_iter_'s current key
+ // or index_iter_'s current *value*.
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
+ (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
+ }
+ bool IsValuePinned() const override {
+ // Load current block if not loaded.
+ if (is_at_first_key_from_index_) {
+ const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
+ }
+ // BlockIter::IsValuePinned() is always true. No need to check
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ block_iter_points_to_real_block_;
+ }
+
+ void ResetDataIter() {
+ if (block_iter_points_to_real_block_) {
+ if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
+ block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
+ }
+ block_iter_.Invalidate(Status::OK());
+ block_iter_points_to_real_block_ = false;
+ }
+ }
+
+ void SavePrevIndexValue() {
+ if (block_iter_points_to_real_block_) {
+ // Reseek. If they end up with the same data block, we shouldn't re-fetch
+ // the same data block.
+ prev_block_offset_ = index_iter_->value().handle.offset();
+ }
+ }
+
+ private:
+ enum class IterDirection {
+ kForward,
+ kBackward,
+ };
+
+ const BlockBasedTable* table_;
+ const ReadOptions read_options_;
+ const InternalKeyComparator& icomp_;
+ UserComparatorWrapper user_comparator_;
+ InternalIteratorBase<IndexValue>* index_iter_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ TBlockIter block_iter_;
+
+ // True if block_iter_ is initialized and points to the same block
+ // as index iterator.
+ bool block_iter_points_to_real_block_;
+ // See InternalIteratorBase::IsOutOfBound().
+ bool is_out_of_bound_ = false;
+ // Whether the current data block is fully within the iterate upper bound.
+ bool data_block_within_upper_bound_ = false;
+ // True if we're standing at the first key of a block, and we haven't loaded
+ // that block yet. A call to value() will trigger loading the block.
+ bool is_at_first_key_from_index_ = false;
+ bool check_filter_;
+ // TODO(Zhongyi): pick a better name
+ bool need_upper_bound_check_;
+ const SliceTransform* prefix_extractor_;
+ BlockType block_type_;
+ uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+ BlockCacheLookupContext lookup_context_;
+ // Readahead size used in compaction; its value is used only if
+ // lookup_context_.caller == kCompaction.
+ size_t compaction_readahead_size_;
+
+ size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
+ size_t readahead_limit_ = 0;
+ int64_t num_file_reads_ = 0;
+ std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+
+ // If `target` is null, seek to first.
+ void SeekImpl(const Slice* target);
+
+ void InitDataBlock();
+ bool MaterializeCurrentBlock();
+ void FindKeyForward();
+ void FindBlockForward();
+ void FindKeyBackward();
+ void CheckOutOfBound();
+
+ // Check if data block is fully within iterate_upper_bound.
+ //
+ // Note: MyRocks may update the iterate bounds between seeks. To work around
+ // this, we need to check and update data_block_within_upper_bound_
+ // accordingly.
+ void CheckDataBlockWithinUpperBound();
+
+ bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+ if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+ // The upper bound check isn't sufficient in the backward direction to
+ // guarantee the same result as total order, so disable the prefix
+ // check.
+ return true;
+ }
+ if (check_filter_ &&
+ !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
+ need_upper_bound_check_, &lookup_context_)) {
+ // TODO: remember that the iterator was invalidated because of a prefix
+ // mismatch. This can prevent the upper-level file iterator from falsely
+ // believing the position is the end of the SST file and moving to
+ // the first key of the next file.
+ ResetDataIter();
+ return false;
+ }
+ return true;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_builder.cc b/src/rocksdb/table/block_based/block_builder.cc
new file mode 100644
index 000000000..6f77ef97c
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_builder.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string. This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key. We call this a "restart
+// point". The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key. Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+// shared_bytes: varint32
+// unshared_bytes: varint32
+// value_length: varint32
+// key_delta: char[unshared_bytes]
+// value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+// restarts: uint32[num_restarts]
+// num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
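+//
+// For example (illustrative only): adding "apple" and then "applesauce" with
+// a restart interval larger than 2 yields a second entry with
+// shared_bytes = 5, unshared_bytes = 5 and key_delta = "sauce"; a reader
+// reconstructs the key by appending "sauce" to the first 5 bytes of the
+// previous key.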
+
+#include "table/block_based/block_builder.h"
+
+#include <assert.h>
+#include <algorithm>
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/data_block_footer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlockBuilder::BlockBuilder(
+ int block_restart_interval, bool use_delta_encoding,
+ bool use_value_delta_encoding,
+ BlockBasedTableOptions::DataBlockIndexType index_type,
+ double data_block_hash_table_util_ratio)
+ : block_restart_interval_(block_restart_interval),
+ use_delta_encoding_(use_delta_encoding),
+ use_value_delta_encoding_(use_value_delta_encoding),
+ restarts_(),
+ counter_(0),
+ finished_(false) {
+ switch (index_type) {
+ case BlockBasedTableOptions::kDataBlockBinarySearch:
+ break;
+ case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+ data_block_hash_index_builder_.Initialize(
+ data_block_hash_table_util_ratio);
+ break;
+ default:
+ assert(0);
+ }
+ assert(block_restart_interval_ >= 1);
+ restarts_.push_back(0); // First restart point is at offset 0
+ estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+}
+
+void BlockBuilder::Reset() {
+ buffer_.clear();
+ restarts_.clear();
+ restarts_.push_back(0); // First restart point is at offset 0
+ estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+ counter_ = 0;
+ finished_ = false;
+ last_key_.clear();
+ if (data_block_hash_index_builder_.Valid()) {
+ data_block_hash_index_builder_.Reset();
+ }
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
+ const Slice& value) const {
+ size_t estimate = CurrentSizeEstimate();
+ // Note: this is an imprecise estimate as it accounts for the whole key size
+ // instead of non-shared key size.
+ estimate += key.size();
+ // In value delta encoding we estimate the value delta size as half the full
+ // value size since only the size field of block handle is encoded.
+ estimate +=
+ !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
+ ? value.size()
+ : value.size() / 2;
+
+ if (counter_ >= block_restart_interval_) {
+ estimate += sizeof(uint32_t); // a new restart entry.
+ }
+
+ estimate += sizeof(int32_t); // varint for shared prefix length.
+ // Note: this is an imprecise estimate as we will have two encoded sizes, one
+ // for the shared key length and one for the non-shared key length.
+ estimate += VarintLength(key.size()); // varint for key length.
+ if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
+ estimate += VarintLength(value.size()); // varint for value length.
+ }
+
+ return estimate;
+}
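+
+// For example (illustrative arithmetic only): with a 16-byte key, a 100-byte
+// value, no value delta encoding and counter_ below the restart interval,
+// EstimateSizeAfterKV() above adds 16 + 100 bytes of payload, sizeof(int32_t)
+// for the shared-prefix varint, and one varint byte each for the key and
+// value lengths: 122 bytes on top of CurrentSizeEstimate().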
+
+Slice BlockBuilder::Finish() {
+ // Append restart array
+ for (size_t i = 0; i < restarts_.size(); i++) {
+ PutFixed32(&buffer_, restarts_[i]);
+ }
+
+ uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
+ BlockBasedTableOptions::DataBlockIndexType index_type =
+ BlockBasedTableOptions::kDataBlockBinarySearch;
+ if (data_block_hash_index_builder_.Valid() &&
+ CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
+ data_block_hash_index_builder_.Finish(buffer_);
+ index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+ }
+
+ // footer is a packed format of data_block_index_type and num_restarts
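+ // For example (assuming the packing implemented in data_block_footer.cc,
+ // which stores the index type in the high bit): num_restarts = 5 with the
+ // kDataBlockBinaryAndHash index type packs to (1u << 31) | 5.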
+ uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
+
+ PutFixed32(&buffer_, block_footer);
+ finished_ = true;
+ return Slice(buffer_);
+}
+
+void BlockBuilder::Add(const Slice& key, const Slice& value,
+ const Slice* const delta_value) {
+ assert(!finished_);
+ assert(counter_ <= block_restart_interval_);
+ assert(!use_value_delta_encoding_ || delta_value);
+ size_t shared = 0; // number of bytes shared with prev key
+ if (counter_ >= block_restart_interval_) {
+ // Restart compression
+ restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
+ estimate_ += sizeof(uint32_t);
+ counter_ = 0;
+
+ if (use_delta_encoding_) {
+ // Update state
+ last_key_.assign(key.data(), key.size());
+ }
+ } else if (use_delta_encoding_) {
+ Slice last_key_piece(last_key_);
+ // See how much sharing to do with previous string
+ shared = key.difference_offset(last_key_piece);
+
+ // Update state
+ // We used to just copy the changed data here, but it appears to be
+ // faster to just copy the whole thing.
+ last_key_.assign(key.data(), key.size());
+ }
+
+ const size_t non_shared = key.size() - shared;
+ const size_t curr_size = buffer_.size();
+
+ if (use_value_delta_encoding_) {
+ // Add "<shared><non_shared>" to buffer_
+ PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+ static_cast<uint32_t>(non_shared));
+ } else {
+ // Add "<shared><non_shared><value_size>" to buffer_
+ PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+ static_cast<uint32_t>(non_shared),
+ static_cast<uint32_t>(value.size()));
+ }
+
+ // Add string delta to buffer_ followed by value
+ buffer_.append(key.data() + shared, non_shared);
+ // Use value delta encoding only when the key has shared bytes. This
+ // simplifies the decoding, which can figure out which encoding was used
+ // simply by looking at the shared bytes size.
+ if (shared != 0 && use_value_delta_encoding_) {
+ buffer_.append(delta_value->data(), delta_value->size());
+ } else {
+ buffer_.append(value.data(), value.size());
+ }
+
+ if (data_block_hash_index_builder_.Valid()) {
+ data_block_hash_index_builder_.Add(ExtractUserKey(key),
+ restarts_.size() - 1);
+ }
+
+ counter_++;
+ estimate_ += buffer_.size() - curr_size;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_builder.h b/src/rocksdb/table/block_based/block_builder.h
new file mode 100644
index 000000000..42c996e5b
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_builder.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include <stdint.h>
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "table/block_based/data_block_hash_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder {
+ public:
+ BlockBuilder(const BlockBuilder&) = delete;
+ void operator=(const BlockBuilder&) = delete;
+
+ explicit BlockBuilder(int block_restart_interval,
+ bool use_delta_encoding = true,
+ bool use_value_delta_encoding = false,
+ BlockBasedTableOptions::DataBlockIndexType index_type =
+ BlockBasedTableOptions::kDataBlockBinarySearch,
+ double data_block_hash_table_util_ratio = 0.75);
+
+ // Reset the contents as if the BlockBuilder was just constructed.
+ void Reset();
+
+ // REQUIRES: Finish() has not been called since the last call to Reset().
+ // REQUIRES: key is larger than any previously added key
+ void Add(const Slice& key, const Slice& value,
+ const Slice* const delta_value = nullptr);
+
+ // Finish building the block and return a slice that refers to the
+ // block contents. The returned slice will remain valid for the
+ // lifetime of this builder or until Reset() is called.
+ Slice Finish();
+
+ // Returns an estimate of the current (uncompressed) size of the block
+ // we are building.
+ inline size_t CurrentSizeEstimate() const {
+ return estimate_ + (data_block_hash_index_builder_.Valid()
+ ? data_block_hash_index_builder_.EstimateSize()
+ : 0);
+ }
+
+ // Returns an estimated block size after appending key and value.
+ size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+ // Return true iff no entries have been added since the last Reset()
+ bool empty() const { return buffer_.empty(); }
+
+ private:
+ const int block_restart_interval_;
+ // TODO(myabandeh): put it into a separate IndexBlockBuilder
+ const bool use_delta_encoding_;
+ // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values
+ const bool use_value_delta_encoding_;
+
+ std::string buffer_; // Destination buffer
+ std::vector<uint32_t> restarts_; // Restart points
+ size_t estimate_;
+ int counter_; // Number of entries emitted since restart
+ bool finished_; // Has Finish() been called?
+ std::string last_key_;
+ DataBlockHashIndexBuilder data_block_hash_index_builder_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc
new file mode 100644
index 000000000..f9d92c74c
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.cc
@@ -0,0 +1,232 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+ return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+ return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce the memory footprint, if there is only one block per bucket, the
+// entry stores the block id directly. If there is more than one block per
+// bucket, because of hash collisions or a single prefix spanning multiple
+// blocks, the entry points to an array of block ids. The block array is an
+// array of uint32_t's. The first uint32_t indicates the total number of
+// blocks, followed by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for an empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+ return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+ uint32_t index = block_id ^ kBlockArrayMask;
+ assert(index < kBlockArrayMask);
+ return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+ assert(index < kBlockArrayMask);
+ return index | kBlockArrayMask;
+}
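+
+// For example (illustrative only): a bucket entry of 5 has the high bit
+// clear, so it is a direct block id (block 5). EncodeIndex(7) yields
+// 0x80000007, which DecodeIndex() maps back to offset 7 in the block array,
+// where block_array_buffer[7] holds the number of blocks and the block ids
+// follow it.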
+
+// temporary storage for prefix information during index building
+struct PrefixRecord {
+ Slice prefix;
+ uint32_t start_block;
+ uint32_t end_block;
+ uint32_t num_blocks;
+ PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+ explicit Builder(const SliceTransform* internal_prefix_extractor)
+ : internal_prefix_extractor_(internal_prefix_extractor) {}
+
+ void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+ PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+ arena_.AllocateAligned(sizeof(PrefixRecord)));
+ record->prefix = key_prefix;
+ record->start_block = start_block;
+ record->end_block = start_block + num_blocks - 1;
+ record->num_blocks = num_blocks;
+ prefixes_.push_back(record);
+ }
+
+ BlockPrefixIndex* Finish() {
+ // For now, use roughly 1:1 prefix to bucket ratio.
+ uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+ // Collect prefix records that hash to the same bucket into a single
+ // linked list.
+ std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+ std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+ for (PrefixRecord* current : prefixes_) {
+ uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+ // merge the prefix block span if the first block of this prefix is
+ // connected to the last block of the previous prefix.
+ PrefixRecord* prev = prefixes_per_bucket[bucket];
+ if (prev) {
+ assert(current->start_block >= prev->end_block);
+ auto distance = current->start_block - prev->end_block;
+ if (distance <= 1) {
+ prev->end_block = current->end_block;
+ prev->num_blocks = prev->end_block - prev->start_block + 1;
+ num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
+ continue;
+ }
+ }
+ current->next = prev;
+ prefixes_per_bucket[bucket] = current;
+ num_blocks_per_bucket[bucket] += current->num_blocks;
+ }
+
+ // Calculate the block array buffer size
+ uint32_t total_block_array_entries = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks > 1) {
+ total_block_array_entries += (num_blocks + 1);
+ }
+ }
+
+ // Populate the final prefix block index
+ uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
+ uint32_t* buckets = new uint32_t[num_buckets];
+ uint32_t offset = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks == 0) {
+ assert(prefixes_per_bucket[i] == nullptr);
+ buckets[i] = kNoneBlock;
+ } else if (num_blocks == 1) {
+ assert(prefixes_per_bucket[i] != nullptr);
+ assert(prefixes_per_bucket[i]->next == nullptr);
+ buckets[i] = prefixes_per_bucket[i]->start_block;
+ } else {
+ assert(total_block_array_entries > 0);
+ assert(prefixes_per_bucket[i] != nullptr);
+ buckets[i] = EncodeIndex(offset);
+ block_array_buffer[offset] = num_blocks;
+ uint32_t* last_block = &block_array_buffer[offset + num_blocks];
+ auto current = prefixes_per_bucket[i];
+ // populate block ids from largest to smallest
+ while (current != nullptr) {
+ for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
+ *last_block = current->end_block - iter;
+ last_block--;
+ }
+ current = current->next;
+ }
+ assert(last_block == &block_array_buffer[offset]);
+ offset += (num_blocks + 1);
+ }
+ }
+
+ assert(offset == total_block_array_entries);
+
+ return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
+ buckets, total_block_array_entries,
+ block_array_buffer);
+ }
+
+ private:
+ const SliceTransform* internal_prefix_extractor_;
+
+ std::vector<PrefixRecord*> prefixes_;
+ Arena arena_;
+};
+
+Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
+ const Slice& prefixes, const Slice& prefix_meta,
+ BlockPrefixIndex** prefix_index) {
+ uint64_t pos = 0;
+ auto meta_pos = prefix_meta;
+ Status s;
+ Builder builder(internal_prefix_extractor);
+
+ while (!meta_pos.empty()) {
+ uint32_t prefix_size = 0;
+ uint32_t entry_index = 0;
+ uint32_t num_blocks = 0;
+ if (!GetVarint32(&meta_pos, &prefix_size) ||
+ !GetVarint32(&meta_pos, &entry_index) ||
+ !GetVarint32(&meta_pos, &num_blocks)) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: unable to read from it.");
+ break;
+ }
+ if (pos + prefix_size > prefixes.size()) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: size inconsistency.");
+ break;
+ }
+ Slice prefix(prefixes.data() + pos, prefix_size);
+ builder.Add(prefix, entry_index, num_blocks);
+
+ pos += prefix_size;
+ }
+
+ if (s.ok() && pos != prefixes.size()) {
+ s = Status::Corruption("Corrupted prefix meta block");
+ }
+
+ if (s.ok()) {
+ *prefix_index = builder.Finish();
+ }
+
+ return s;
+}
+
+uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
+ Slice prefix = internal_prefix_extractor_->Transform(key);
+
+ uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
+ uint32_t block_id = buckets_[bucket];
+
+ if (IsNone(block_id)) {
+ return 0;
+ } else if (IsBlockId(block_id)) {
+ *blocks = &buckets_[bucket];
+ return 1;
+ } else {
+ uint32_t index = DecodeIndex(block_id);
+ assert(index < num_block_array_buffer_entries_);
+ *blocks = &block_array_buffer_[index + 1];
+ uint32_t num_blocks = block_array_buffer_[index];
+ assert(num_blocks > 1);
+ assert(index + num_blocks < num_block_array_buffer_entries_);
+ return num_blocks;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.h b/src/rocksdb/table/block_based/block_prefix_index.h
new file mode 100644
index 000000000..04121320e
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockPrefixIndex accepts a key and, if found, returns its restart index
+// within that index block.
+class BlockPrefixIndex {
+ public:
+ // Maps a key to a list of data blocks that could potentially contain
+ // the key, based on the prefix.
+ // Returns the total number of relevant blocks, 0 means the key does
+ // not exist.
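+ //
+ // An illustrative usage sketch (not part of the original sources;
+ // `prefix_index` and `ikey` are placeholder names):
+ //
+ //   uint32_t* block_ids = nullptr;
+ //   uint32_t num = prefix_index->GetBlocks(ikey, &block_ids);
+ //   for (uint32_t i = 0; i < num; i++) {
+ //     // probe the candidate block with id block_ids[i]
+ //   }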
+ uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+ size_t ApproximateMemoryUsage() const {
+ return sizeof(BlockPrefixIndex) +
+ (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+ }
+
+ // Create the hash index by reading the metadata blocks.
+ // @param prefixes: a sequence of prefixes.
+ // @param prefix_meta: contains the "metadata" of the prefixes.
+ static Status Create(const SliceTransform* hash_key_extractor,
+ const Slice& prefixes, const Slice& prefix_meta,
+ BlockPrefixIndex** prefix_index);
+
+ ~BlockPrefixIndex() {
+ delete[] buckets_;
+ delete[] block_array_buffer_;
+ }
+
+ private:
+ class Builder;
+ friend Builder;
+
+ BlockPrefixIndex(const SliceTransform* internal_prefix_extractor,
+ uint32_t num_buckets, uint32_t* buckets,
+ uint32_t num_block_array_buffer_entries,
+ uint32_t* block_array_buffer)
+ : internal_prefix_extractor_(internal_prefix_extractor),
+ num_buckets_(num_buckets),
+ num_block_array_buffer_entries_(num_block_array_buffer_entries),
+ buckets_(buckets),
+ block_array_buffer_(block_array_buffer) {}
+
+ const SliceTransform* internal_prefix_extractor_;
+ uint32_t num_buckets_;
+ uint32_t num_block_array_buffer_entries_;
+ uint32_t* buckets_;
+ uint32_t* block_array_buffer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_test.cc b/src/rocksdb/table/block_based/block_test.cc
new file mode 100644
index 000000000..efa5b3ae3
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_test.cc
@@ -0,0 +1,627 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <stdio.h>
+#include <algorithm>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random *rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+ Random *rnd) {
+ char buf[50];
+ char *p = &buf[0];
+ snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+ std::string k(p);
+ if (padding_size) {
+ k += RandomString(rnd, padding_size);
+ }
+
+ return k;
+}
+
+// Generate random key value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+ std::vector<std::string> *values, const int from,
+ const int len, const int step = 1,
+ const int padding_size = 0,
+ const int keys_share_prefix = 1) {
+ Random rnd(302);
+
+ // generate different prefixes
+ for (int i = from; i < from + len; i += step) {
+ // generate keys that share the prefix
+ for (int j = 0; j < keys_share_prefix; ++j) {
+ keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+ // 100 bytes values
+ values->emplace_back(RandomString(&rnd, 100));
+ }
+ }
+}
+
+class BlockTest : public testing::Test {};
+
+// block test
+TEST_F(BlockTest, SimpleTest) {
+ Random rnd(301);
+ Options options = Options();
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ BlockBuilder builder(16);
+ int num_records = 100000;
+
+ GenerateRandomKVs(&keys, &values, 0, num_records);
+ // add a bunch of records to a block
+ for (int i = 0; i < num_records; i++) {
+ builder.Add(keys[i], values[i]);
+ }
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ // read contents of block sequentially
+ int count = 0;
+ InternalIterator *iter =
+ reader.NewDataIterator(options.comparator, options.comparator);
+ for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
+ // read kv from block
+ Slice k = iter->key();
+ Slice v = iter->value();
+
+ // compare with lookaside array
+ ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+ ASSERT_EQ(v.ToString().compare(values[count]), 0);
+ }
+ delete iter;
+
+ // read block contents randomly
+ iter = reader.NewDataIterator(options.comparator, options.comparator);
+ for (int i = 0; i < num_records; i++) {
+ // find a random key in the lookaside array
+ int index = rnd.Uniform(num_records);
+ Slice k(keys[index]);
+
+ // search in block for this key
+ iter->Seek(k);
+ ASSERT_TRUE(iter->Valid());
+ Slice v = iter->value();
+ ASSERT_EQ(v.ToString().compare(values[index]), 0);
+ }
+ delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+ const std::vector<std::string> &keys,
+ const std::vector<std::string> &values,
+ const int /*prefix_group_size*/ = 1) {
+ builder->reset(new BlockBuilder(1 /* restart interval */));
+
+ // Add all of the keys
+ for (size_t i = 0; i < keys.size(); ++i) {
+ (*builder)->Add(keys[i], values[i]);
+ }
+ Slice rawblock = (*builder)->Finish();
+
+ BlockContents contents;
+ contents.data = rawblock;
+
+ return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+ const std::vector<std::string> &keys,
+ const std::vector<std::string> &values) {
+ const size_t prefix_size = 6;
+ // create block reader
+ BlockContents contents_ref(contents.data);
+ Block reader1(std::move(contents), kDisableGlobalSequenceNumber);
+ Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber);
+
+ std::unique_ptr<const SliceTransform> prefix_extractor(
+ NewFixedPrefixTransform(prefix_size));
+
+ std::unique_ptr<InternalIterator> regular_iter(
+ reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator()));
+
+ // Seek existent keys
+ for (size_t i = 0; i < keys.size(); i++) {
+ regular_iter->Seek(keys[i]);
+ ASSERT_OK(regular_iter->status());
+ ASSERT_TRUE(regular_iter->Valid());
+
+ Slice v = regular_iter->value();
+ ASSERT_EQ(v.ToString().compare(values[i]), 0);
+ }
+
+ // Seek non-existent keys.
+ // For the hash index, if no key with a given prefix is found, the iterator
+ // is simply set as invalid; whereas the binary-search-based iterator will
+ // return the one that is closest.
+ for (int i = 1; i < max_key - 1; i += 2) {
+ auto key = GenerateKey(i, 0, 0, nullptr);
+ regular_iter->Seek(key);
+ ASSERT_TRUE(regular_iter->Valid());
+ }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST_F(BlockTest, SimpleIndexHash) {
+ const int kMaxKey = 100000;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+ kMaxKey /* last key id */, 2 /* step */,
+ 8 /* padding size (8 bytes randomly generated suffix) */);
+
+ std::unique_ptr<BlockBuilder> builder;
+ auto contents = GetBlockContents(&builder, keys, values);
+
+ CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
+ const int kMaxKey = 100000;
+ // for each prefix, there will be 5 keys that start with it.
+ const int kPrefixGroup = 5;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ // Generate keys with same prefix.
+ GenerateRandomKVs(&keys, &values, 0, // first key id
+ kMaxKey, // last key id
+ 2, // step
+ 10, // padding size,
+ kPrefixGroup);
+
+ std::unique_ptr<BlockBuilder> builder;
+ auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+ CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+// A slow but accurate version of BlockReadAmpBitmap that simply stores
+// all the marked ranges in a set.
+class BlockReadAmpBitmapSlowAndAccurate {
+ public:
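+ // Ranges are stored as (end_offset, start_offset) pairs so that a
+ // lower_bound on (offset, 0) finds the first range whose end is >= offset.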
+ void Mark(size_t start_offset, size_t end_offset) {
+ assert(end_offset >= start_offset);
+ marked_ranges_.emplace(end_offset, start_offset);
+ }
+
+ void ResetCheckSequence() { iter_valid_ = false; }
+
+ // Return true if the byte at `offset` falls inside any marked range.
+ // This does a linear search from the previous position, so when calling
+ // multiple times, `offset` needs to be non-decreasing to get correct
+ // results. Call ResetCheckSequence() to reset the search position.
+ bool IsPinMarked(size_t offset) {
+ if (iter_valid_) {
+ // We have an existing iterator; try a linear search from it,
+ // bounded to 64 steps.
+ for (int i = 0; i < 64; i++) {
+ if (offset < iter_->second) {
+ return false;
+ }
+ if (offset <= iter_->first) {
+ return true;
+ }
+
+ iter_++;
+ if (iter_ == marked_ranges_.end()) {
+ iter_valid_ = false;
+ return false;
+ }
+ }
+ }
+ // Initial call, or the linear search has gone too far.
+ // Fall back to binary search.
+ iter_ = marked_ranges_.lower_bound(
+ std::make_pair(offset, static_cast<size_t>(0)));
+ if (iter_ == marked_ranges_.end()) {
+ iter_valid_ = false;
+ return false;
+ }
+ iter_valid_ = true;
+ return offset <= iter_->first && offset >= iter_->second;
+ }
+
+ private:
+ std::set<std::pair<size_t, size_t>> marked_ranges_;
+ std::set<std::pair<size_t, size_t>>::iterator iter_;
+ bool iter_valid_ = false;
+};
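+
+// In the test below, every Mark() on the real BlockReadAmpBitmap is mirrored
+// on this reference implementation, and IsPinMarked() is probed at each bit
+// boundary (bit_idx * kBytesPerBit + pin_offset) to recompute the expected
+// READ_AMP_ESTIMATE_USEFUL_BYTES value.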
+
+TEST_F(BlockTest, BlockReadAmpBitmap) {
+ uint32_t pin_offset = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
+ pin_offset = *(static_cast<uint32_t *>(arg));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ std::vector<size_t> block_sizes = {
+ 1, // 1 byte
+ 32, // 32 bytes
+ 61, // 61 bytes
+ 64, // 64 bytes
+ 512, // 0.5 KB
+ 1024, // 1 KB
+ 1024 * 4, // 4 KB
+ 1024 * 10, // 10 KB
+ 1024 * 50, // 50 KB
+ 1024 * 1024 * 4, // 4 MB
+ 777,
+ 124653,
+ };
+ const size_t kBytesPerBit = 64;
+
+ Random rnd(301);
+ for (size_t block_size : block_sizes) {
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
+ BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
+
+ size_t needed_bits = (block_size / kBytesPerBit);
+ if (block_size % kBytesPerBit != 0) {
+ needed_bits++;
+ }
+
+ ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
+
+ // Generate some random entries
+ std::vector<size_t> random_entry_offsets;
+ for (int i = 0; i < 1000; i++) {
+ random_entry_offsets.push_back(rnd.Next() % block_size);
+ }
+ std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
+ auto it =
+ std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
+ random_entry_offsets.resize(
+ std::distance(random_entry_offsets.begin(), it));
+
+ std::vector<std::pair<size_t, size_t>> random_entries;
+ for (size_t i = 0; i < random_entry_offsets.size(); i++) {
+ size_t entry_start = random_entry_offsets[i];
+ size_t entry_end;
+ if (i + 1 < random_entry_offsets.size()) {
+ entry_end = random_entry_offsets[i + 1] - 1;
+ } else {
+ entry_end = block_size - 1;
+ }
+ random_entries.emplace_back(entry_start, entry_end);
+ }
+
+ for (size_t i = 0; i < random_entries.size(); i++) {
+ read_amp_slow_and_accurate.ResetCheckSequence();
+ auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+
+ read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
+ static_cast<uint32_t>(current_entry.second));
+ read_amp_slow_and_accurate.Mark(current_entry.first,
+ current_entry.second);
+
+ size_t total_bits = 0;
+ for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
+ total_bits += read_amp_slow_and_accurate.IsPinMarked(
+ bit_idx * kBytesPerBit + pin_offset);
+ }
+ size_t expected_estimate_useful = total_bits * kBytesPerBit;
+ size_t got_estimate_useful =
+ stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlockTest, BlockWithReadAmpBitmap) {
+ Random rnd(301);
+ Options options = Options();
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ BlockBuilder builder(16);
+ int num_records = 10000;
+
+ GenerateRandomKVs(&keys, &values, 0, num_records, 1);
+ // add a bunch of records to a block
+ for (int i = 0; i < num_records; i++) {
+ builder.Add(keys[i], values[i]);
+ }
+
+ Slice rawblock = builder.Finish();
+ const size_t kBytesPerBit = 8;
+
+ // Read the block sequentially using Next()
+ {
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber,
+ kBytesPerBit, stats.get());
+
+ // read contents of block sequentially
+ size_t read_bytes = 0;
+ DataBlockIter *iter = reader.NewDataIterator(
+ options.comparator, options.comparator, nullptr, stats.get());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ iter->value();
+ read_bytes += iter->TEST_CurrentEntrySize();
+
+ double semi_acc_read_amp =
+ static_cast<double>(read_bytes) / rawblock.size();
+ double read_amp = static_cast<double>(stats->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ // Error in read amplification will be less than 1% if we are reading
+ // sequentially
+ double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+ EXPECT_LT(error_pct, 1);
+ }
+
+ delete iter;
+ }
+
+ // Read the block sequentially using Seek()
+ {
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber,
+ kBytesPerBit, stats.get());
+
+ size_t read_bytes = 0;
+ DataBlockIter *iter = reader.NewDataIterator(
+ options.comparator, options.comparator, nullptr, stats.get());
+ for (int i = 0; i < num_records; i++) {
+ Slice k(keys[i]);
+
+ // search in block for this key
+ iter->Seek(k);
+ iter->value();
+ read_bytes += iter->TEST_CurrentEntrySize();
+
+ double semi_acc_read_amp =
+ static_cast<double>(read_bytes) / rawblock.size();
+ double read_amp = static_cast<double>(stats->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ // Error in read amplification will be less than 1% if we are reading
+ // sequentially
+ double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+ EXPECT_LT(error_pct, 1);
+ }
+ delete iter;
+ }
+
+ // Read the block randomly
+ {
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber,
+ kBytesPerBit, stats.get());
+
+ size_t read_bytes = 0;
+ DataBlockIter *iter = reader.NewDataIterator(
+ options.comparator, options.comparator, nullptr, stats.get());
+ std::unordered_set<int> read_keys;
+ for (int i = 0; i < num_records; i++) {
+ int index = rnd.Uniform(num_records);
+ Slice k(keys[index]);
+
+ iter->Seek(k);
+ iter->value();
+ if (read_keys.find(index) == read_keys.end()) {
+ read_keys.insert(index);
+ read_bytes += iter->TEST_CurrentEntrySize();
+ }
+
+ double semi_acc_read_amp =
+ static_cast<double>(read_bytes) / rawblock.size();
+ double read_amp = static_cast<double>(stats->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+ // Error in read amplification will be less than 2% if we are reading
+ // randomly
+ EXPECT_LT(error_pct, 2);
+ }
+ delete iter;
+ }
+}
+
+TEST_F(BlockTest, ReadAmpBitmapPow2) {
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u);
+
+ ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u);
+ ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u);
+}
+
+class IndexBlockTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ IndexBlockTest() = default;
+
+ bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); }
+ bool includeFirstKey() const { return std::get<1>(GetParam()); }
+};
+
+// Similar to GenerateRandomKVs but for index block contents.
+void GenerateRandomIndexEntries(std::vector<std::string> *separators,
+ std::vector<BlockHandle> *block_handles,
+ std::vector<std::string> *first_keys,
+ const int len) {
+ Random rnd(42);
+
+ // For each of `len` blocks, we need to generate a first and last key.
+ // Let's generate len*2 random keys, sort them, and group them into
+ // consecutive pairs.
+ std::set<std::string> keys;
+ while ((int)keys.size() < len * 2) {
+ // Keys need to be at least 8 bytes long to look like internal keys.
+ keys.insert(test::RandomKey(&rnd, 12));
+ }
+
+ uint64_t offset = 0;
+ for (auto it = keys.begin(); it != keys.end();) {
+ first_keys->emplace_back(*it++);
+ separators->emplace_back(*it++);
+ uint64_t size = rnd.Uniform(1024 * 16);
+ BlockHandle handle(offset, size);
+ offset += size + kBlockTrailerSize;
+ block_handles->emplace_back(handle);
+ }
+}
+
+TEST_P(IndexBlockTest, IndexValueEncodingTest) {
+ Random rnd(301);
+ Options options = Options();
+
+ std::vector<std::string> separators;
+ std::vector<BlockHandle> block_handles;
+ std::vector<std::string> first_keys;
+ const bool kUseDeltaEncoding = true;
+ BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding());
+ int num_records = 100;
+
+ GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
+ num_records);
+ BlockHandle last_encoded_handle;
+ for (int i = 0; i < num_records; i++) {
+ IndexValue entry(block_handles[i], first_keys[i]);
+ std::string encoded_entry;
+ std::string delta_encoded_entry;
+ entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
+ if (useValueDeltaEncoding() && i > 0) {
+ entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
+ &last_encoded_handle);
+ }
+ last_encoded_handle = entry.handle;
+ const Slice delta_encoded_entry_slice(delta_encoded_entry);
+ builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
+ }
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ const bool kTotalOrderSeek = true;
+ const bool kIncludesSeq = true;
+ const bool kValueIsFull = !useValueDeltaEncoding();
+ IndexBlockIter *kNullIter = nullptr;
+ Statistics *kNullStats = nullptr;
+ // read contents of block sequentially
+ InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+ options.comparator, options.comparator, kNullIter, kNullStats,
+ kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull);
+ iter->SeekToFirst();
+ for (int index = 0; index < num_records; ++index) {
+ ASSERT_TRUE(iter->Valid());
+
+ Slice k = iter->key();
+ IndexValue v = iter->value();
+
+ EXPECT_EQ(separators[index], k.ToString());
+ EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+ EXPECT_EQ(block_handles[index].size(), v.handle.size());
+ EXPECT_EQ(includeFirstKey() ? first_keys[index] : "",
+ v.first_internal_key.ToString());
+
+ iter->Next();
+ }
+ delete iter;
+
+ // read block contents randomly
+ iter = reader.NewIndexIterator(options.comparator, options.comparator,
+ kNullIter, kNullStats, kTotalOrderSeek,
+ includeFirstKey(), kIncludesSeq, kValueIsFull);
+ for (int i = 0; i < num_records * 2; i++) {
+ // find a random key in the lookaside array
+ int index = rnd.Uniform(num_records);
+ Slice k(separators[index]);
+
+ // search in block for this key
+ iter->Seek(k);
+ ASSERT_TRUE(iter->Valid());
+ IndexValue v = iter->value();
+ EXPECT_EQ(separators[index], iter->key().ToString());
+ EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+ EXPECT_EQ(block_handles[index].size(), v.handle.size());
+ EXPECT_EQ(includeFirstKey() ? first_keys[index] : "",
+ v.first_internal_key.ToString());
+ }
+ delete iter;
+}
+
+INSTANTIATE_TEST_CASE_P(P, IndexBlockTest,
+ ::testing::Values(std::make_tuple(false, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, false),
+ std::make_tuple(true, true)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/block_type.h b/src/rocksdb/table/block_based/block_type.h
new file mode 100644
index 000000000..b2a913746
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_type.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Represents the types of blocks used in the block based table format.
+// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
+// for details.
+
+enum class BlockType : uint8_t {
+ kData,
+ kFilter,
+ kProperties,
+ kCompressionDictionary,
+ kRangeDeletion,
+ kHashIndexPrefixes,
+ kHashIndexMetadata,
+ kMetaIndex,
+ kIndex,
+ // Note: keep kInvalid the last value when adding new enum values.
+ kInvalid
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/cachable_entry.h b/src/rocksdb/table/block_based/cachable_entry.h
new file mode 100644
index 000000000..598f1ef57
--- /dev/null
+++ b/src/rocksdb/table/block_based/cachable_entry.h
@@ -0,0 +1,220 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include "port/likely.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/cleanable.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// CachableEntry is a handle to an object that may or may not be in the block
+// cache. It is used in a variety of ways:
+//
+// 1) It may refer to an object in the block cache. In this case, cache_ and
+// cache_handle_ are not nullptr, and the cache handle has to be released when
+// the CachableEntry is destroyed (the lifecycle of the cached object, on the
+// other hand, is managed by the cache itself).
+// 2) It may uniquely own the (non-cached) object it refers to (examples include
+// a block read directly from file, or uncompressed blocks when there is a
+// compressed block cache but no uncompressed block cache). In such cases, the
+// object has to be destroyed when the CachableEntry is destroyed.
+// 3) It may point to an object (cached or not) without owning it. In this case,
+// no action is needed when the CachableEntry is destroyed.
+// 4) Sometimes, management of a cached or owned object (see #1 and #2 above)
+// is transferred to some other object. This is used for instance with iterators
+// (where cleanup is performed using a chain of cleanup functions,
+// see Cleanable).
+//
+// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not
+// allowed); hence, this is a move-only type, where a move transfers the
+// management responsibilities, and leaves the source object in an empty state.
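+//
+// A minimal usage sketch (hypothetical caller code, assuming a Block* value
+// and a cache lookup that may or may not hit):
+//
+//   CachableEntry<Block> entry;
+//   if (cache_handle != nullptr) {
+//     entry.SetCachedValue(block, cache, cache_handle);  // released on reset
+//   } else {
+//     entry.SetOwnedValue(block);  // deleted when the entry is destroyed
+//   }
+//   entry.TransferTo(iter);  // iter's cleanup chain now owns the resource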
+
+template <class T>
+class CachableEntry {
+public:
+ CachableEntry() = default;
+
+ CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle,
+ bool own_value)
+ : value_(value)
+ , cache_(cache)
+ , cache_handle_(cache_handle)
+ , own_value_(own_value)
+ {
+ assert(value_ != nullptr ||
+ (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+ assert(!!cache_ == !!cache_handle_);
+ assert(!cache_handle_ || !own_value_);
+ }
+
+ CachableEntry(const CachableEntry&) = delete;
+ CachableEntry& operator=(const CachableEntry&) = delete;
+
+ CachableEntry(CachableEntry&& rhs)
+ : value_(rhs.value_)
+ , cache_(rhs.cache_)
+ , cache_handle_(rhs.cache_handle_)
+ , own_value_(rhs.own_value_)
+ {
+ assert(value_ != nullptr ||
+ (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+ assert(!!cache_ == !!cache_handle_);
+ assert(!cache_handle_ || !own_value_);
+
+ rhs.ResetFields();
+ }
+
+ CachableEntry& operator=(CachableEntry&& rhs) {
+ if (UNLIKELY(this == &rhs)) {
+ return *this;
+ }
+
+ ReleaseResource();
+
+ value_ = rhs.value_;
+ cache_ = rhs.cache_;
+ cache_handle_ = rhs.cache_handle_;
+ own_value_ = rhs.own_value_;
+
+ assert(value_ != nullptr ||
+ (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+ assert(!!cache_ == !!cache_handle_);
+ assert(!cache_handle_ || !own_value_);
+
+ rhs.ResetFields();
+
+ return *this;
+ }
+
+ ~CachableEntry() {
+ ReleaseResource();
+ }
+
+ bool IsEmpty() const {
+ return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr &&
+ !own_value_;
+ }
+
+ bool IsCached() const {
+ assert(!!cache_ == !!cache_handle_);
+
+ return cache_handle_ != nullptr;
+ }
+
+ T* GetValue() const { return value_; }
+ Cache* GetCache() const { return cache_; }
+ Cache::Handle* GetCacheHandle() const { return cache_handle_; }
+ bool GetOwnValue() const { return own_value_; }
+
+ void Reset() {
+ ReleaseResource();
+ ResetFields();
+ }
+
+ void TransferTo(Cleanable* cleanable) {
+ if (cleanable) {
+ if (cache_handle_ != nullptr) {
+ assert(cache_ != nullptr);
+ cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_);
+ } else if (own_value_) {
+ cleanable->RegisterCleanup(&DeleteValue, value_, nullptr);
+ }
+ }
+
+ ResetFields();
+ }
+
+ void SetOwnedValue(T* value) {
+ assert(value != nullptr);
+
+ if (UNLIKELY(value_ == value && own_value_)) {
+ assert(cache_ == nullptr && cache_handle_ == nullptr);
+ return;
+ }
+
+ Reset();
+
+ value_ = value;
+ own_value_ = true;
+ }
+
+ void SetUnownedValue(T* value) {
+ assert(value != nullptr);
+
+ if (UNLIKELY(value_ == value && cache_ == nullptr &&
+ cache_handle_ == nullptr && !own_value_)) {
+ return;
+ }
+
+ Reset();
+
+ value_ = value;
+ assert(!own_value_);
+ }
+
+ void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) {
+ assert(value != nullptr);
+ assert(cache != nullptr);
+ assert(cache_handle != nullptr);
+
+ if (UNLIKELY(value_ == value && cache_ == cache &&
+ cache_handle_ == cache_handle && !own_value_)) {
+ return;
+ }
+
+ Reset();
+
+ value_ = value;
+ cache_ = cache;
+ cache_handle_ = cache_handle;
+ assert(!own_value_);
+ }
+
+private:
+ void ReleaseResource() {
+ if (LIKELY(cache_handle_ != nullptr)) {
+ assert(cache_ != nullptr);
+ cache_->Release(cache_handle_);
+ } else if (own_value_) {
+ delete value_;
+ }
+ }
+
+ void ResetFields() {
+ value_ = nullptr;
+ cache_ = nullptr;
+ cache_handle_ = nullptr;
+ own_value_ = false;
+ }
+
+ static void ReleaseCacheHandle(void* arg1, void* arg2) {
+ Cache* const cache = static_cast<Cache*>(arg1);
+ assert(cache);
+
+ Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+ assert(cache_handle);
+
+ cache->Release(cache_handle);
+ }
+
+ static void DeleteValue(void* arg1, void* /* arg2 */) {
+ delete static_cast<T*>(arg1);
+ }
+
+private:
+ T* value_ = nullptr;
+ Cache* cache_ = nullptr;
+ Cache::Handle* cache_handle_ = nullptr;
+ bool own_value_ = false;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_footer.cc b/src/rocksdb/table/block_based/data_block_footer.cc
new file mode 100644
index 000000000..5d5d8ed55
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_footer.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/data_block_footer.h"
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const int kDataBlockIndexTypeBitShift = 31;
+
+// 0x7FFFFFFF
+const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+// 0x7FFFFFFF
+const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+uint32_t PackIndexTypeAndNumRestarts(
+ BlockBasedTableOptions::DataBlockIndexType index_type,
+ uint32_t num_restarts) {
+ if (num_restarts > kMaxNumRestarts) {
+ assert(0); // mute travis "unused" warning
+ }
+
+ uint32_t block_footer = num_restarts;
+ if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
+ block_footer |= 1u << kDataBlockIndexTypeBitShift;
+ } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
+ assert(0);
+ }
+
+ return block_footer;
+}
+
+void UnPackIndexTypeAndNumRestarts(
+ uint32_t block_footer,
+ BlockBasedTableOptions::DataBlockIndexType* index_type,
+ uint32_t* num_restarts) {
+ if (index_type) {
+ if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
+ *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+ } else {
+ *index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
+ }
+ }
+
+ if (num_restarts) {
+ *num_restarts = block_footer & kNumRestartsMask;
+ assert(*num_restarts <= kMaxNumRestarts);
+ }
+}
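+
+// For example, packing kDataBlockBinaryAndHash with num_restarts = 5 sets the
+// index-type bit in the MSB:
+// PackIndexTypeAndNumRestarts(kDataBlockBinaryAndHash, 5) == 0x80000005u,
+// and UnPackIndexTypeAndNumRestarts(0x80000005u, ...) recovers both values.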
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_footer.h b/src/rocksdb/table/block_based/data_block_footer.h
new file mode 100644
index 000000000..c1cfd4730
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_footer.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint32_t PackIndexTypeAndNumRestarts(
+ BlockBasedTableOptions::DataBlockIndexType index_type,
+ uint32_t num_restarts);
+
+void UnPackIndexTypeAndNumRestarts(
+ uint32_t block_footer,
+ BlockBasedTableOptions::DataBlockIndexType* index_type,
+ uint32_t* num_restarts);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.cc b/src/rocksdb/table/block_based/data_block_hash_index.cc
new file mode 100644
index 000000000..222475834
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DataBlockHashIndexBuilder::Add(const Slice& key,
+ const size_t restart_index) {
+ assert(Valid());
+ if (restart_index > kMaxRestartSupportedByHashIndex) {
+ valid_ = false;
+ return;
+ }
+
+ uint32_t hash_value = GetSliceHash(key);
+ hash_and_restart_pairs_.emplace_back(hash_value,
+ static_cast<uint8_t>(restart_index));
+ estimated_num_buckets_ += bucket_per_key_;
+}
+
+void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
+ assert(Valid());
+ uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
+
+ if (num_buckets == 0) {
+ num_buckets = 1; // sanity check
+ }
+
+ // The built-in hash cannot distribute strings well across buckets when
+ // num_buckets is a power of two, resulting in a high hash collision rate.
+ // We make num_buckets odd to avoid this issue.
+ num_buckets |= 1;
+
+ std::vector<uint8_t> buckets(num_buckets, kNoEntry);
+ // write the restart_index array
+ for (auto& entry : hash_and_restart_pairs_) {
+ uint32_t hash_value = entry.first;
+ uint8_t restart_index = entry.second;
+ uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
+ if (buckets[buck_idx] == kNoEntry) {
+ buckets[buck_idx] = restart_index;
+ } else if (buckets[buck_idx] != restart_index) {
+ // a bucket cannot store two different restart indexes; mark a collision
+ buckets[buck_idx] = kCollision;
+ }
+ }
+
+ for (uint8_t restart_index : buckets) {
+ buffer.append(
+ const_cast<const char*>(reinterpret_cast<char*>(&restart_index)),
+ sizeof(restart_index));
+ }
+
+ // write NUM_BUCK
+ PutFixed16(&buffer, num_buckets);
+
+ assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
+}
+
+void DataBlockHashIndexBuilder::Reset() {
+ estimated_num_buckets_ = 0;
+ valid_ = true;
+ hash_and_restart_pairs_.clear();
+}
+
+void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
+ uint16_t* map_offset) {
+ assert(size >= sizeof(uint16_t)); // NUM_BUCKETS
+ num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t));
+ assert(num_buckets_ > 0);
+ assert(size > num_buckets_ * sizeof(uint8_t));
+ *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) -
+ num_buckets_ * sizeof(uint8_t));
+}
+
+uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
+ const Slice& key) const {
+ uint32_t hash_value = GetSliceHash(key);
+ uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);
+ const char* bucket_table = data + map_offset;
+ return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.h b/src/rocksdb/table/block_based/data_block_hash_index.h
new file mode 100644
index 000000000..f356395f3
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This is an experimental feature aiming to reduce the CPU utilization of
+// point-lookup within a data-block. It is only used in data blocks, and not
+// in meta-data blocks or per-table index blocks.
+//
+// It is only used to support BlockBasedTable::Get().
+//
+// A serialized hash index is appended to the data-block. The new block data
+// format is as follows:
+//
+// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
+//
+// RI: Restart Interval (the same as the default data-block format)
+// RI_IDX: Restart Interval index (the same as the default data-block format)
+// HASH_IDX: The new data-block hash index feature.
+// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as
+// the flag indicating if this hash index is in use. Note that
+// given a data block < 32KB, the MSB is never used. So we can
+// borrow the MSB as the hash index flag. Therefore, this format is
+// compatible with the legacy data-blocks with num_restarts < 32768,
+// as the MSB is 0.
+//
+// The format of the data-block hash index is as follows:
+//
+// HASH_IDX: [B B B ... B NUM_BUCK]
+//
+// B: bucket, an array of restart indexes. Each bucket is a uint8_t.
+// NUM_BUCK: Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flags:
+// kNoEntry=255,
+// kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can support is 253.
+//
+// Buckets are initialized to be kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in
+// the bucket. If there is already a different restart index there, we update
+// the existing entry to a collision marker (kCollision). If the
+// bucket is already marked as a collision, we do not store the restart
+// index either.
+//
+// During a query, the key is first hashed to a bucket. If the bucket stores a
+// valid restart index, we go directly to that restart interval to search for
+// the key. If the bucket stores kNoEntry, the key was never added to this
+// block; if it stores kCollision, we fall back to the regular binary-search
+// path.
+//
+// Note that we only support blocks with #restart_intervals < 254. If a block
+// has more restart intervals than that, the hash index will not be created
+// for it.
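+//
+// For example (illustrative values only): with num_buckets = 5 and
+// GetSliceHash(key) = 13, the key maps to bucket 13 % 5 = 3. If bucket 3 is
+// kNoEntry, the key's restart index is stored there; if bucket 3 already
+// holds a different restart index, it is overwritten with kCollision and
+// lookups for keys hashing there fall back to binary search.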
+
+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t addresses, we only support blocks of no more than 64KB
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
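+
+// For example, util_ratio = 0.75 gives bucket_per_key_ = 1 / 0.75 ~= 1.33, so
+// adding 100 keys yields an estimate of ~133 buckets (the final bucket count
+// is forced to be odd in Finish to reduce collisions).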
+
+class DataBlockHashIndexBuilder {
+ public:
+ DataBlockHashIndexBuilder()
+ : bucket_per_key_(-1 /*uninitialized marker*/),
+ estimated_num_buckets_(0),
+ valid_(false) {}
+
+ void Initialize(double util_ratio) {
+ if (util_ratio <= 0) {
+ util_ratio = kDefaultUtilRatio; // sanity check
+ }
+ bucket_per_key_ = 1 / util_ratio;
+ valid_ = true;
+ }
+
+ inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+ void Add(const Slice& key, const size_t restart_index);
+ void Finish(std::string& buffer);
+ void Reset();
+ inline size_t EstimateSize() const {
+ uint16_t estimated_num_buckets =
+ static_cast<uint16_t>(estimated_num_buckets_);
+
+ // Matching the num_buckets computation in DataBlockHashIndexBuilder::Finish.
+ estimated_num_buckets |= 1;
+
+ return sizeof(uint16_t) +
+ static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+ }
+
+ private:
+ double bucket_per_key_; // is the multiplicative inverse of util_ratio_
+ double estimated_num_buckets_;
+
+ // Now the only usage for `valid_` is to mark false when the inserted
+ // restart_index is larger than supported. In this case HashIndex is not
+ // appended to the block content.
+ bool valid_;
+
+ std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+ friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+ DataBlockHashIndex() : num_buckets_(0) {}
+
+ void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+ uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
+
+ inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+ // To make the serialized hash index compact and to save space overhead,
+ // all the data fields persisted in the block are in uint16 format.
+ // We find that a uint16 is large enough to index every offset of a 64KiB
+ // block.
+ // In other words, DataBlockHashIndex does not support blocks of size equal
+ // to or greater than 64KiB.
+ uint16_t num_buckets_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index_test.cc b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
new file mode 100644
index 000000000..8548c8508
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
@@ -0,0 +1,719 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+ uint16_t map_offset, const Slice& key,
+ uint8_t& restart_point) {
+ uint8_t entry = index.Lookup(data, map_offset, key);
+ if (entry == kCollision) {
+ return true;
+ }
+
+ if (entry == kNoEntry) {
+ return false;
+ }
+
+ return entry == restart_point;
+}
+
+// Random KV generator similar to the one in block_test
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+ Random* rnd) {
+ char buf[50];
+ char* p = &buf[0];
+ snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+ std::string k(p);
+ if (padding_size) {
+ k += RandomString(rnd, padding_size);
+ }
+
+ return k;
+}
+
+// Generate random key-value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string>* keys,
+ std::vector<std::string>* values, const int from,
+ const int len, const int step = 1,
+ const int padding_size = 0,
+ const int keys_share_prefix = 1) {
+ Random rnd(302);
+
+ // generate different prefixes
+ for (int i = from; i < from + len; i += step) {
+ // generate keys that share the prefix
+ for (int j = 0; j < keys_share_prefix; ++j) {
+ keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+ // 100-byte values
+ values->emplace_back(RandomString(&rnd, 100));
+ }
+ }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestSmall) {
+ DataBlockHashIndexBuilder builder;
+ builder.Initialize(0.75 /*util_ratio*/);
+ for (int j = 0; j < 5; j++) {
+ for (uint8_t i = 0; i < 2 + j; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ }
+
+ size_t estimated_size = builder.EstimateSize();
+
+ std::string buffer("fake"), buffer2;
+ size_t original_size = buffer.size();
+ estimated_size += original_size;
+ builder.Finish(buffer);
+
+ ASSERT_EQ(buffer.size(), estimated_size);
+
+ buffer2 = buffer; // test for the correctness of relative offset
+
+ Slice s(buffer2);
+ DataBlockHashIndex index;
+ uint16_t map_offset;
+ index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+ // the additional hash map should start at the end of the buffer
+ ASSERT_EQ(original_size, map_offset);
+ for (uint8_t i = 0; i < 2; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ ASSERT_TRUE(
+ SearchForOffset(index, s.data(), map_offset, key, restart_point));
+ }
+ builder.Reset();
+ }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTest) {
+ // 100 keys at util_ratio 0.75 => ~133 buckets
+ DataBlockHashIndexBuilder builder;
+ builder.Initialize(0.75 /*util_ratio*/);
+
+ for (uint8_t i = 0; i < 100; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ }
+
+ size_t estimated_size = builder.EstimateSize();
+
+ std::string buffer("fake content"), buffer2;
+ size_t original_size = buffer.size();
+ estimated_size += original_size;
+ builder.Finish(buffer);
+
+ ASSERT_EQ(buffer.size(), estimated_size);
+
+ buffer2 = buffer; // test for the correctness of relative offset
+
+ Slice s(buffer2);
+ DataBlockHashIndex index;
+ uint16_t map_offset;
+ index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+ // the additional hash map should start at the end of the buffer
+ ASSERT_EQ(original_size, map_offset);
+ for (uint8_t i = 0; i < 100; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ ASSERT_TRUE(
+ SearchForOffset(index, s.data(), map_offset, key, restart_point));
+ }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
+ // 100 keys hashed into ~133 buckets; hash collisions are expected
+ DataBlockHashIndexBuilder builder;
+ builder.Initialize(0.75 /*util_ratio*/);
+
+ for (uint8_t i = 0; i < 100; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ }
+
+ size_t estimated_size = builder.EstimateSize();
+
+ std::string buffer("some other fake content to take up space"), buffer2;
+ size_t original_size = buffer.size();
+ estimated_size += original_size;
+ builder.Finish(buffer);
+
+ ASSERT_EQ(buffer.size(), estimated_size);
+
+ buffer2 = buffer; // test for the correctness of relative offset
+
+ Slice s(buffer2);
+ DataBlockHashIndex index;
+ uint16_t map_offset;
+ index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+ // the additional hash map should start at the end of the buffer
+ ASSERT_EQ(original_size, map_offset);
+ for (uint8_t i = 0; i < 100; i++) {
+ std::string key("key" + std::to_string(i));
+ uint8_t restart_point = i;
+ ASSERT_TRUE(
+ SearchForOffset(index, s.data(), map_offset, key, restart_point));
+ }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
+ DataBlockHashIndexBuilder builder;
+ builder.Initialize(0.75 /*util_ratio*/);
+ std::unordered_map<std::string, uint8_t> m;
+
+ for (uint8_t i = 0; i < 100; i++) {
+ if (i % 2) {
+ continue; // leave half of the keys out
+ }
+ std::string key = "key" + std::to_string(i);
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ m[key] = restart_point;
+ }
+
+ size_t estimated_size = builder.EstimateSize();
+
+ std::string buffer("filling stuff"), buffer2;
+ size_t original_size = buffer.size();
+ estimated_size += original_size;
+ builder.Finish(buffer);
+
+ ASSERT_EQ(buffer.size(), estimated_size);
+
+ buffer2 = buffer; // test for the correctness of relative offset
+
+ Slice s(buffer2);
+ DataBlockHashIndex index;
+ uint16_t map_offset;
+ index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+ // the additional hash map should start at the end of the buffer
+ ASSERT_EQ(original_size, map_offset);
+ for (uint8_t i = 0; i < 100; i++) {
+ std::string key = "key" + std::to_string(i);
+ uint8_t restart_point = i;
+ if (m.count(key)) {
+ ASSERT_TRUE(m[key] == restart_point);
+ ASSERT_TRUE(
+ SearchForOffset(index, s.data(), map_offset, key, restart_point));
+ } else {
+ // we allow false positives, so don't test the non-existing keys.
+ // when a false positive happens, the search will continue into the
+ // restart intervals to see if the key really exists.
+ }
+ }
+}
+
+TEST(DataBlockHashIndex, RestartIndexExceedMax) {
+ DataBlockHashIndexBuilder builder;
+ builder.Initialize(0.75 /*util_ratio*/);
+ std::unordered_map<std::string, uint8_t> m;
+
+ for (uint8_t i = 0; i <= 253; i++) {
+ std::string key = "key" + std::to_string(i);
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ }
+ ASSERT_TRUE(builder.Valid());
+
+ builder.Reset();
+
+ for (uint8_t i = 0; i <= 254; i++) {
+ std::string key = "key" + std::to_string(i);
+ uint8_t restart_point = i;
+ builder.Add(key, restart_point);
+ }
+
+ ASSERT_FALSE(builder.Valid());
+
+ builder.Reset();
+ ASSERT_TRUE(builder.Valid());
+}
+
+TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) {
+ Options options = Options();
+
+ BlockBuilder builder(1 /* block_restart_interval */,
+ true /* use_delta_encoding */,
+ false /* use_value_delta_encoding */,
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+ // #restarts <= 253. HashIndex is valid
+ for (int i = 0; i <= 253; i++) {
+ std::string ukey = "key" + std::to_string(i);
+ InternalKey ikey(ukey, 0, kTypeValue);
+ builder.Add(ikey.Encode().ToString(), "value");
+ }
+
+ {
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ ASSERT_EQ(reader.IndexType(),
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+ }
+
+ builder.Reset();
+
+ // #restarts > 253. HashIndex is not used
+ for (int i = 0; i <= 254; i++) {
+ std::string ukey = "key" + std::to_string(i);
+ InternalKey ikey(ukey, 0, kTypeValue);
+ builder.Add(ikey.Encode().ToString(), "value");
+ }
+
+ {
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ ASSERT_EQ(reader.IndexType(),
+ BlockBasedTableOptions::kDataBlockBinarySearch);
+ }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+ Options options = Options();
+ std::string ukey(10, 'k');
+ InternalKey ikey(ukey, 0, kTypeValue);
+
+ BlockBuilder builder(1 /* block_restart_interval */,
+ false /* use_delta_encoding */,
+ false /* use_value_delta_encoding */,
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+ {
+ // insert a large value. The block size plus HashIndex is 65536.
+ std::string value(65502, 'v');
+
+ builder.Add(ikey.Encode().ToString(), value);
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+ ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+ std::cerr << "block size: " << rawblock.size() << std::endl;
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ ASSERT_EQ(reader.IndexType(),
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+ }
+
+ builder.Reset();
+
+ {
+ // insert a large value. The block size plus HashIndex would be 65537,
+ // which exceeds the max block size supported by HashIndex (65536),
+ // so no HashIndex will be created when the build finishes.
+ std::string value(65503, 'v');
+
+ builder.Add(ikey.Encode().ToString(), value);
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+ ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+ std::cerr << "block size: " << rawblock.size() << std::endl;
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ // the index type has fallen back to binary search at build time.
+ ASSERT_EQ(reader.IndexType(),
+ BlockBasedTableOptions::kDataBlockBinarySearch);
+ }
+}
+
+TEST(DataBlockHashIndex, BlockTestSingleKey) {
+ Options options = Options();
+
+ BlockBuilder builder(16 /* block_restart_interval */,
+ true /* use_delta_encoding */,
+ false /* use_value_delta_encoding */,
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+ std::string ukey("gopher");
+ std::string value("gold");
+ InternalKey ikey(ukey, 10, kTypeValue);
+ builder.Add(ikey.Encode().ToString(), value /*value*/);
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+ const InternalKeyComparator icmp(BytewiseComparator());
+ auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator());
+ bool may_exist;
+ // search in block for the key just inserted
+ {
+ InternalKey seek_ikey(ukey, 10, kValueTypeForSeek);
+ may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+ ASSERT_TRUE(may_exist);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(
+ options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0);
+ ASSERT_EQ(iter->value(), value);
+ }
+
+ // search in block for the existing ukey, but with higher seqno
+ {
+ InternalKey seek_ikey(ukey, 20, kValueTypeForSeek);
+
+ // HashIndex should be able to set the iter correctly
+ may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+ ASSERT_TRUE(may_exist);
+ ASSERT_TRUE(iter->Valid());
+
+ // user key should match
+ ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey),
+ 0);
+
+ // seek_key seqno number should be greater than that of iter result
+ ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()),
+ GetInternalKeySeqno(iter->key()));
+
+ ASSERT_EQ(iter->value(), value);
+ }
+
+ // Search in block for the existing ukey, but with a lower seqno.
+ // In this case, hash can find the only occurrence of the user_key, but
+ // ParseNextDataKey() will skip it as it does not have an older seqno.
+ // Here, SeekForGet() is effective at locating the user_key, and
+ // iter->Valid() == false indicates that we've reached the end of
+ // the block, so the caller should continue searching the next block.
+ {
+ InternalKey seek_ikey(ukey, 5, kValueTypeForSeek);
+ may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+ ASSERT_TRUE(may_exist);
+ ASSERT_FALSE(iter->Valid()); // should have reached to the end of block
+ }
+
+ delete iter;
+}
+
+TEST(DataBlockHashIndex, BlockTestLarge) {
+ Random rnd(1019);
+ Options options = Options();
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ BlockBuilder builder(16 /* block_restart_interval */,
+ true /* use_delta_encoding */,
+ false /* use_value_delta_encoding */,
+ BlockBasedTableOptions::kDataBlockBinaryAndHash);
+ int num_records = 500;
+
+ GenerateRandomKVs(&keys, &values, 0, num_records);
+
+ // Generate keys, adding a trailing "1" to mark existent keys.
+ // Later we will seek keys with a trailing "0" to test seeking
+ // non-existent keys.
+ for (int i = 0; i < num_records; i++) {
+ std::string ukey(keys[i] + "1" /* existing key marker */);
+ InternalKey ikey(ukey, 0, kTypeValue);
+ builder.Add(ikey.Encode().ToString(), values[i]);
+ }
+
+ // read serialized contents of the block
+ Slice rawblock = builder.Finish();
+
+ // create block reader
+ BlockContents contents;
+ contents.data = rawblock;
+ Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+ const InternalKeyComparator icmp(BytewiseComparator());
+
+ // random seek existent keys
+ for (int i = 0; i < num_records; i++) {
+ auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator());
+ // find a random key in the lookaside array
+ int index = rnd.Uniform(num_records);
+ std::string ukey(keys[index] + "1" /* existing key marker */);
+ InternalKey ikey(ukey, 0, kTypeValue);
+
+ // search in block for this key
+ bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+ ASSERT_TRUE(may_exist);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(values[index], iter->value());
+
+ delete iter;
+ }
+
+ // random seek non-existent user keys
+ // In case A), the user_key cannot be found in the HashIndex. The key may
+ // still exist in the next block, so the iter is invalidated to tell the
+ // caller to search the next block. This test exercises case A).
+ //
+ // Note that for non-existent keys, there is a possibility of false
+ // positives, i.e. the key is still hashed into some restart interval.
+ // Two additional outcomes are possible:
+ // B) linear seek within the restart interval does not find the key; the
+ // iter stops at the start of the next restart interval. The key does
+ // not exist anywhere.
+ // C) linear seek within the restart interval does not find the key; the
+ // iter stops at the end of the block, i.e. restarts_. The key may exist
+ // in the next block.
+ // So these combinations are possible when searching a non-existent user_key:
+ //
+ // case# may_exist iter->Valid()
+ // A true false
+ // B false true
+ // C true false
+
+ for (int i = 0; i < num_records; i++) {
+ auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator());
+ // find a random key in the lookaside array
+ int index = rnd.Uniform(num_records);
+ std::string ukey(keys[index] + "0" /* non-existing key marker */);
+ InternalKey ikey(ukey, 0, kTypeValue);
+
+ // search in block for this key
+ bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+ if (!may_exist) {
+ ASSERT_TRUE(iter->Valid());
+ }
+ if (!iter->Valid()) {
+ ASSERT_TRUE(may_exist);
+ }
+
+ delete iter;
+ }
+}
+
+// helper routine for DataBlockHashIndex.BlockBoundary
+void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
+ std::string& v2, InternalKey& seek_ikey,
+ GetContext& get_context, Options& options) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ std::unique_ptr<TableReader> table_reader;
+ int level_ = -1;
+
+ std::vector<std::string> keys;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ const InternalKeyComparator internal_comparator(options.comparator);
+
+ EnvOptions soptions;
+
+ soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+ file_writer.reset(
+ test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */));
+ std::unique_ptr<TableBuilder> builder;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+ builder.reset(ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories,
+ options.compression, options.sample_for_compression,
+ CompressionOptions(), false /* skip_filters */,
+ column_family_name, level_),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer.get()));
+
+ builder->Add(ik1.Encode().ToString(), v1);
+ builder->Add(ik2.Encode().ToString(), v2);
+ EXPECT_TRUE(builder->status().ok());
+
+ Status s = builder->Finish();
+ file_writer->Flush();
+ EXPECT_TRUE(s.ok()) << s.ToString();
+
+ EXPECT_EQ(
+ test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(),
+ builder->FileSize());
+
+ // Open the table
+ file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource(
+ test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(),
+ 0 /*uniq_id*/, ioptions.allow_mmap_reads)));
+ const bool kSkipFilters = true;
+ const bool kImmortal = true;
+ ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
+ internal_comparator, !kSkipFilters, !kImmortal,
+ level_),
+ std::move(file_reader),
+ test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(),
+ &table_reader);
+ // Search using Get()
+ ReadOptions ro;
+
+ ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context,
+ moptions.prefix_extractor.get()));
+}
+
+TEST(DataBlockHashIndex, BlockBoundary) {
+ BlockBasedTableOptions table_options;
+ table_options.data_block_index_type =
+ BlockBasedTableOptions::kDataBlockBinaryAndHash;
+ table_options.block_restart_interval = 1;
+ table_options.block_size = 4096;
+
+ Options options;
+ options.comparator = BytewiseComparator();
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // insert two large k/v pair. Given that the block_size is 4096, one k/v
+ // pair will take up one block.
+ // [ k1/v1 ][ k2/v2 ]
+ // [ Block N ][ Block N+1 ]
+
+ {
+ // [ "aab"@100 ][ "axy"@10 ]
+ // | Block N ][ Block N+1 ]
+ // seek for "axy"@60
+ std::string uk1("aab");
+ InternalKey ik1(uk1, 100, kTypeValue);
+ std::string v1(4100, '1'); // large value
+
+ std::string uk2("axy");
+ InternalKey ik2(uk2, 10, kTypeValue);
+ std::string v2(4100, '2'); // large value
+
+ PinnableSlice value;
+ std::string seek_ukey("axy");
+ InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, seek_ukey, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+
+ TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_EQ(value, v2);
+ value.Reset();
+ }
+
+ {
+ // [ "axy"@100 ][ "axy"@10 ]
+ // | Block N ][ Block N+1 ]
+ // seek for "axy"@60
+ std::string uk1("axy");
+ InternalKey ik1(uk1, 100, kTypeValue);
+ std::string v1(4100, '1'); // large value
+
+ std::string uk2("axy");
+ InternalKey ik2(uk2, 10, kTypeValue);
+ std::string v2(4100, '2'); // large value
+
+ PinnableSlice value;
+ std::string seek_ukey("axy");
+ InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, seek_ukey, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+
+ TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_EQ(value, v2);
+ value.Reset();
+ }
+
+ {
+ // [ "axy"@100 ][ "axy"@10 ]
+ // | Block N ][ Block N+1 ]
+ // seek for "axy"@120
+ std::string uk1("axy");
+ InternalKey ik1(uk1, 100, kTypeValue);
+ std::string v1(4100, '1'); // large value
+
+ std::string uk2("axy");
+ InternalKey ik2(uk2, 10, kTypeValue);
+ std::string v2(4100, '2'); // large value
+
+ PinnableSlice value;
+ std::string seek_ukey("axy");
+ InternalKey seek_ikey(seek_ukey, 120, kTypeValue);
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, seek_ukey, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+
+ TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_EQ(value, v1);
+ value.Reset();
+ }
+
+ {
+ // [ "axy"@100 ][ "axy"@10 ]
+ // | Block N ][ Block N+1 ]
+ // seek for "axy"@5
+ std::string uk1("axy");
+ InternalKey ik1(uk1, 100, kTypeValue);
+ std::string v1(4100, '1'); // large value
+
+ std::string uk2("axy");
+ InternalKey ik2(uk2, 10, kTypeValue);
+ std::string v2(4100, '2'); // large value
+
+ PinnableSlice value;
+ std::string seek_ukey("axy");
+ InternalKey seek_ikey(seek_ukey, 5, kTypeValue);
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, seek_ukey, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+
+ TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+ ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+ value.Reset();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/filter_block.h b/src/rocksdb/table/block_based/filter_block.h
new file mode 100644
index 000000000..1ad8d3f18
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file. It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+//
+// It is a base class for BlockBasedFilter and FullFilter.
+// These two are both used in BlockBasedTable. The first contains a filter
+// for only part of the keys in an SST file; the second contains a filter
+// for all keys in the SST file.
+
+#pragma once
+
+#include <climits>  // for ULLONG_MAX used below
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kNotValid = ULLONG_MAX;
+class FilterPolicy;
+
+class GetContext;
+using MultiGetRange = MultiGetContext::Range;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+// (StartBlock Add*)* Finish
+//
+// BlockBased/Full FilterBlock builders are driven through the same
+// interface; see the usage sketch below the class.
+class FilterBlockBuilder {
+ public:
+ explicit FilterBlockBuilder() {}
+ // No copying allowed
+ FilterBlockBuilder(const FilterBlockBuilder&) = delete;
+ void operator=(const FilterBlockBuilder&) = delete;
+
+ virtual ~FilterBlockBuilder() {}
+
+  virtual bool IsBlockBased() = 0;  // True iff this is a block-based filter
+ virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter
+ virtual void Add(const Slice& key) = 0; // Add a key to current filter
+ virtual size_t NumAdded() const = 0; // Number of keys added
+ Slice Finish() { // Generate Filter
+ const BlockHandle empty_handle;
+ Status dont_care_status;
+ auto ret = Finish(empty_handle, &dont_care_status);
+ assert(dont_care_status.ok());
+ return ret;
+ }
+ virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+};
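+
+// An illustrative (hypothetical) usage sketch of the builder contract above,
+// matching (StartBlock Add*)* Finish; `builder` stands for any concrete
+// FilterBlockBuilder implementation:
+//
+//   builder->StartBlock(0 /* block_offset */);
+//   builder->Add("key1");
+//   builder->Add("key2");
+//   builder->StartBlock(4096 /* next block_offset */);
+//   builder->Add("key3");
+//   Slice filter_contents = builder->Finish();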
+
+// A FilterBlockReader is used to parse a filter from an SST table.
+// KeyMayMatch and PrefixMayMatch trigger the filter checks.
+//
+// BlockBased/Full FilterBlock readers are driven through the same interface.
+class FilterBlockReader {
+ public:
+ FilterBlockReader() = default;
+ virtual ~FilterBlockReader() = default;
+
+ FilterBlockReader(const FilterBlockReader&) = delete;
+ FilterBlockReader& operator=(const FilterBlockReader&) = delete;
+
+  virtual bool IsBlockBased() = 0;  // True iff this is a block-based filter
+
+ /**
+ * If no_io is set, then it returns true if it cannot answer the query without
+ * reading data from disk. This is used in PartitionedFilterBlockReader to
+   * avoid reading partitions that are not already in the block cache.
+ *
+ * Normally filters are built on only the user keys and the InternalKey is not
+ * needed for a query. The index in PartitionedFilterBlockReader however is
+ * built upon InternalKey and must be provided via const_ikey_ptr when running
+ * queries.
+ */
+ virtual bool KeyMayMatch(const Slice& key,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) = 0;
+
+ virtual void KeysMayMatch(MultiGetRange* range,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) {
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ const Slice ukey = iter->ukey;
+ const Slice ikey = iter->ikey;
+ GetContext* const get_context = iter->get_context;
+ if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey,
+ get_context, lookup_context)) {
+ range->SkipKey(iter);
+ }
+ }
+ }
+
+ /**
+   * no_io and const_ikey_ptr here mean the same as in KeyMayMatch.
+ */
+ virtual bool PrefixMayMatch(const Slice& prefix,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) = 0;
+
+ virtual void PrefixesMayMatch(MultiGetRange* range,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) {
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ const Slice ukey = iter->ukey;
+ const Slice ikey = iter->ikey;
+ GetContext* const get_context = iter->get_context;
+ if (prefix_extractor->InDomain(ukey) &&
+ !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor,
+ block_offset, no_io, &ikey, get_context,
+ lookup_context)) {
+ range->SkipKey(iter);
+ }
+ }
+ }
+
+ virtual size_t ApproximateMemoryUsage() const = 0;
+
+  // Convert this object to a human-readable form.
+ virtual std::string ToString() const {
+ std::string error_msg("Unsupported filter \n");
+ return error_msg;
+ }
+
+ virtual void CacheDependencies(bool /*pin*/) {}
+
+ virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/,
+ const Slice& user_key,
+ const SliceTransform* prefix_extractor,
+ const Comparator* /*comparator*/,
+ const Slice* const const_ikey_ptr,
+ bool* filter_checked, bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) {
+ if (need_upper_bound_check) {
+ return true;
+ }
+ *filter_checked = true;
+ Slice prefix = prefix_extractor->Transform(user_key);
+ return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
+ const_ikey_ptr, /* get_context */ nullptr,
+ lookup_context);
+ }
+};
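+
+// An illustrative (hypothetical) point lookup against a reader; `reader`
+// stands for any concrete FilterBlockReader. Full filters pass kNotValid as
+// the block offset, while block-based filters pass a real data block offset.
+// no_io = true restricts the check to blocks already in the block cache, and
+// const_ikey_ptr supplies the internal key where required (e.g. by
+// PartitionedFilterBlockReader):
+//
+//   if (reader->KeyMayMatch(ukey, prefix_extractor, kNotValid,
+//                           /* no_io */ false, &ikey, get_context,
+//                           lookup_context)) {
+//     // Possibly present: proceed to read the data block.
+//   } else {
+//     // Definitely absent: skip this table file.
+//   }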
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.cc b/src/rocksdb/table/block_based/filter_block_reader_common.cc
new file mode 100644
index 000000000..fa0802669
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block_reader_common.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/filter_block_reader_common.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/parsed_full_filter_block.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename TBlocklike>
+Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<TBlocklike>* filter_block) {
+ PERF_TIMER_GUARD(read_filter_block_nanos);
+
+ assert(table);
+ assert(filter_block);
+ assert(filter_block->IsEmpty());
+
+ const BlockBasedTable::Rep* const rep = table->get_rep();
+ assert(rep);
+
+ const Status s =
+ table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle,
+ UncompressionDict::GetEmptyDict(), filter_block,
+ BlockType::kFilter, get_context, lookup_context,
+ /* for_compaction */ false, use_cache);
+
+ return s;
+}
+
+template <typename TBlocklike>
+const SliceTransform*
+FilterBlockReaderCommon<TBlocklike>::table_prefix_extractor() const {
+ assert(table_);
+
+ const BlockBasedTable::Rep* const rep = table_->get_rep();
+ assert(rep);
+
+ return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr;
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::whole_key_filtering() const {
+ assert(table_);
+ assert(table_->get_rep());
+
+ return table_->get_rep()->whole_key_filtering;
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::cache_filter_blocks() const {
+ assert(table_);
+ assert(table_->get_rep());
+
+ return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+template <typename TBlocklike>
+Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock(
+ bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<TBlocklike>* filter_block) const {
+ assert(filter_block);
+
+ if (!filter_block_.IsEmpty()) {
+ filter_block->SetUnownedValue(filter_block_.GetValue());
+ return Status::OK();
+ }
+
+ ReadOptions read_options;
+ if (no_io) {
+ read_options.read_tier = kBlockCacheTier;
+ }
+
+ return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options,
+ cache_filter_blocks(), get_context, lookup_context,
+ filter_block);
+}
+
+template <typename TBlocklike>
+size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage()
+ const {
+ assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr);
+ return filter_block_.GetOwnValue()
+ ? filter_block_.GetValue()->ApproximateMemoryUsage()
+ : 0;
+}
+
+// Explicitly instantiate templates for all the "blocklike" types we use.
+// This makes it possible to keep the template definitions in the .cc file.
+template class FilterBlockReaderCommon<BlockContents>;
+template class FilterBlockReaderCommon<Block>;
+template class FilterBlockReaderCommon<ParsedFullFilterBlock>;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.h b/src/rocksdb/table/block_based/filter_block_reader_common.h
new file mode 100644
index 000000000..a18bc5449
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block_reader_common.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+class FilePrefetchBuffer;
+
+// Encapsulates common functionality for the various filter block reader
+// implementations. Provides access to the filter block regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+template <typename TBlocklike>
+class FilterBlockReaderCommon : public FilterBlockReader {
+ public:
+ FilterBlockReaderCommon(const BlockBasedTable* t,
+ CachableEntry<TBlocklike>&& filter_block)
+ : table_(t), filter_block_(std::move(filter_block)) {
+ assert(table_);
+ }
+
+ protected:
+ static Status ReadFilterBlock(const BlockBasedTable* table,
+ FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<TBlocklike>* filter_block);
+
+ const BlockBasedTable* table() const { return table_; }
+ const SliceTransform* table_prefix_extractor() const;
+ bool whole_key_filtering() const;
+ bool cache_filter_blocks() const;
+
+ Status GetOrReadFilterBlock(bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<TBlocklike>* filter_block) const;
+
+ size_t ApproximateFilterBlockMemoryUsage() const;
+
+ private:
+ const BlockBasedTable* table_;
+ CachableEntry<TBlocklike> filter_block_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_policy.cc b/src/rocksdb/table/block_based/filter_policy.cc
new file mode 100644
index 000000000..c8f23ee33
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_policy.cc
@@ -0,0 +1,759 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <array>
+#include <deque>
+
+#include "rocksdb/filter_policy.h"
+
+#include "rocksdb/slice.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "third-party/folly/folly/ConstexprMath.h"
+#include "util/bloom_impl.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// See description in FastLocalBloomImpl
+class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+ public:
+ explicit FastLocalBloomBitsBuilder(const int millibits_per_key)
+ : millibits_per_key_(millibits_per_key),
+ num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) {
+ assert(millibits_per_key >= 1000);
+ }
+
+ // No Copy allowed
+ FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete;
+ void operator=(const FastLocalBloomBitsBuilder&) = delete;
+
+ ~FastLocalBloomBitsBuilder() override {}
+
+ virtual void AddKey(const Slice& key) override {
+ uint64_t hash = GetSliceHash64(key);
+ if (hash_entries_.empty() || hash != hash_entries_.back()) {
+ hash_entries_.push_back(hash);
+ }
+ }
+
+ virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ uint32_t len_with_metadata =
+ CalculateSpace(static_cast<uint32_t>(hash_entries_.size()));
+ char* data = new char[len_with_metadata];
+ memset(data, 0, len_with_metadata);
+
+ assert(data);
+ assert(len_with_metadata >= 5);
+
+ uint32_t len = len_with_metadata - 5;
+ if (len > 0) {
+ AddAllEntries(data, len);
+ }
+
+ // See BloomFilterPolicy::GetBloomBitsReader re: metadata
+ // -1 = Marker for newer Bloom implementations
+ data[len] = static_cast<char>(-1);
+ // 0 = Marker for this sub-implementation
+ data[len + 1] = static_cast<char>(0);
+ // num_probes (and 0 in upper bits for 64-byte block size)
+ data[len + 2] = static_cast<char>(num_probes_);
+ // rest of metadata stays zero
+
+ const char* const_data = data;
+ buf->reset(const_data);
+ assert(hash_entries_.empty());
+
+ return Slice(data, len_with_metadata);
+ }
+
+ int CalculateNumEntry(const uint32_t bytes) override {
+ uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0;
+ return static_cast<int>(uint64_t{8000} * bytes_no_meta /
+ millibits_per_key_);
+ }
+
+ uint32_t CalculateSpace(const int num_entry) override {
+ uint32_t num_cache_lines = 0;
+ if (millibits_per_key_ > 0 && num_entry > 0) {
+ num_cache_lines = static_cast<uint32_t>(
+ (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000);
+ }
+ return num_cache_lines * 64 + /*metadata*/ 5;
+ }
+
+ double EstimatedFpRate(size_t keys, size_t bytes) override {
+ return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5,
+ num_probes_, /*hash bits*/ 64);
+ }
+
+ private:
+ void AddAllEntries(char* data, uint32_t len) {
+ // Simple version without prefetching:
+ //
+ // for (auto h : hash_entries_) {
+ // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len,
+ // num_probes_, data);
+ // }
+
+ const size_t num_entries = hash_entries_.size();
+ constexpr size_t kBufferMask = 7;
+ static_assert(((kBufferMask + 1) & kBufferMask) == 0,
+ "Must be power of 2 minus 1");
+
+ std::array<uint32_t, kBufferMask + 1> hashes;
+ std::array<uint32_t, kBufferMask + 1> byte_offsets;
+
+ // Prime the buffer
+ size_t i = 0;
+ for (; i <= kBufferMask && i < num_entries; ++i) {
+ uint64_t h = hash_entries_.front();
+ hash_entries_.pop_front();
+ FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
+ /*out*/ &byte_offsets[i]);
+ hashes[i] = Upper32of64(h);
+ }
+
+ // Process and buffer
+ for (; i < num_entries; ++i) {
+ uint32_t& hash_ref = hashes[i & kBufferMask];
+ uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask];
+ // Process (add)
+ FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_,
+ data + byte_offset_ref);
+ // And buffer
+ uint64_t h = hash_entries_.front();
+ hash_entries_.pop_front();
+ FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
+ /*out*/ &byte_offset_ref);
+ hash_ref = Upper32of64(h);
+ }
+
+ // Finish processing
+ for (i = 0; i <= kBufferMask && i < num_entries; ++i) {
+ FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_,
+ data + byte_offsets[i]);
+ }
+ }
+
+ int millibits_per_key_;
+ int num_probes_;
+ // A deque avoids unnecessary copying of already-saved values
+ // and has near-minimal peak memory use.
+ std::deque<uint64_t> hash_entries_;
+};
+
+// See description in FastLocalBloomImpl
+class FastLocalBloomBitsReader : public FilterBitsReader {
+ public:
+ FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes)
+ : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {}
+
+ // No Copy allowed
+ FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete;
+ void operator=(const FastLocalBloomBitsReader&) = delete;
+
+ ~FastLocalBloomBitsReader() override {}
+
+ bool MayMatch(const Slice& key) override {
+ uint64_t h = GetSliceHash64(key);
+ uint32_t byte_offset;
+ FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
+ /*out*/ &byte_offset);
+ return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_,
+ data_ + byte_offset);
+ }
+
+ virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+ std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+ std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+ for (int i = 0; i < num_keys; ++i) {
+ uint64_t h = GetSliceHash64(*keys[i]);
+ FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
+ /*out*/ &byte_offsets[i]);
+ hashes[i] = Upper32of64(h);
+ }
+ for (int i = 0; i < num_keys; ++i) {
+ may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared(
+ hashes[i], num_probes_, data_ + byte_offsets[i]);
+ }
+ }
+
+ private:
+ const char* data_;
+ const int num_probes_;
+ const uint32_t len_bytes_;
+};
+
+using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
+
+class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+ public:
+ explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
+
+ // No Copy allowed
+ LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
+ void operator=(const LegacyBloomBitsBuilder&) = delete;
+
+ ~LegacyBloomBitsBuilder() override;
+
+ void AddKey(const Slice& key) override;
+
+ Slice Finish(std::unique_ptr<const char[]>* buf) override;
+
+ int CalculateNumEntry(const uint32_t bytes) override;
+
+ uint32_t CalculateSpace(const int num_entry) override {
+ uint32_t dont_care1;
+ uint32_t dont_care2;
+ return CalculateSpace(num_entry, &dont_care1, &dont_care2);
+ }
+
+ double EstimatedFpRate(size_t keys, size_t bytes) override {
+ return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5,
+ num_probes_);
+ }
+
+ private:
+ int bits_per_key_;
+ int num_probes_;
+ std::vector<uint32_t> hash_entries_;
+ Logger* info_log_;
+
+  // Get total bits, rounded up to be CPU-cache-line friendly
+ uint32_t GetTotalBitsForLocality(uint32_t total_bits);
+
+ // Reserve space for new filter
+ char* ReserveSpace(const int num_entry, uint32_t* total_bits,
+ uint32_t* num_lines);
+
+ // Implementation-specific variant of public CalculateSpace
+ uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits,
+ uint32_t* num_lines);
+
+  // Assumes single-threaded access to this function.
+ void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
+};
+
+LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key,
+ Logger* info_log)
+ : bits_per_key_(bits_per_key),
+ num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)),
+ info_log_(info_log) {
+ assert(bits_per_key_);
+}
+
+LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {}
+
+void LegacyBloomBitsBuilder::AddKey(const Slice& key) {
+ uint32_t hash = BloomHash(key);
+ if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
+ hash_entries_.push_back(hash);
+ }
+}
+
+Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
+ uint32_t total_bits, num_lines;
+ size_t num_entries = hash_entries_.size();
+ char* data =
+ ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines);
+ assert(data);
+
+ if (total_bits != 0 && num_lines != 0) {
+ for (auto h : hash_entries_) {
+ AddHash(h, data, num_lines, total_bits);
+ }
+
+ // Check for excessive entries for 32-bit hash function
+ if (num_entries >= /* minimum of 3 million */ 3000000U) {
+ // More specifically, we can detect that the 32-bit hash function
+ // is causing significant increase in FP rate by comparing current
+ // estimated FP rate to what we would get with a normal number of
+ // keys at same memory ratio.
+ double est_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+ num_entries, total_bits / 8, num_probes_);
+ double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+ 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_);
+
+ if (est_fp_rate >= 1.50 * vs_fp_rate) {
+ // For more details, see
+ // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+ ROCKS_LOG_WARN(
+ info_log_,
+ "Using legacy SST/BBT Bloom filter with excessive key count "
+ "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. "
+ "Consider using new Bloom with format_version>=5, smaller SST "
+ "file size, or partitioned filters.",
+ num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate);
+ }
+ }
+ }
+ // See BloomFilterPolicy::GetFilterBitsReader for metadata
+ data[total_bits / 8] = static_cast<char>(num_probes_);
+ EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));
+
+ const char* const_data = data;
+ buf->reset(const_data);
+ hash_entries_.clear();
+
+ return Slice(data, total_bits / 8 + 5);
+}
+
+uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
+ uint32_t num_lines =
+ (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+ // Make num_lines an odd number to make sure more bits are involved
+ // when determining which block.
+ if (num_lines % 2 == 0) {
+ num_lines++;
+ }
+ return num_lines * (CACHE_LINE_SIZE * 8);
+}
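+
+// Worked example (assuming CACHE_LINE_SIZE == 64, i.e. 512 bits per line):
+// total_bits = 10000 -> num_lines = (10000 + 511) / 512 = 20, which is even,
+// so it is bumped to 21, and the function returns 21 * 512 = 10752 bits.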
+
+uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry,
+ uint32_t* total_bits,
+ uint32_t* num_lines) {
+ assert(bits_per_key_);
+ if (num_entry != 0) {
+ uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_);
+
+ *total_bits = GetTotalBitsForLocality(total_bits_tmp);
+ *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
+ assert(*total_bits > 0 && *total_bits % 8 == 0);
+ } else {
+ // filter is empty, just leave space for metadata
+ *total_bits = 0;
+ *num_lines = 0;
+ }
+
+ // Reserve space for Filter
+ uint32_t sz = *total_bits / 8;
+ sz += 5; // 4 bytes for num_lines, 1 byte for num_probes
+ return sz;
+}
+
+char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry,
+ uint32_t* total_bits,
+ uint32_t* num_lines) {
+ uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines);
+ char* data = new char[sz];
+ memset(data, 0, sz);
+ return data;
+}
+
+int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) {
+ assert(bits_per_key_);
+ assert(bytes > 0);
+ int high = static_cast<int>(bytes * 8 / bits_per_key_ + 1);
+ int low = 1;
+ int n = high;
+ for (; n >= low; n--) {
+ if (CalculateSpace(n) <= bytes) {
+ break;
+ }
+ }
+ assert(n < high); // High should be an overestimation
+ return n;
+}
+
+inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data,
+ uint32_t num_lines,
+ uint32_t total_bits) {
+#ifdef NDEBUG
+ static_cast<void>(total_bits);
+#endif
+ assert(num_lines > 0 && total_bits > 0);
+
+ LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data,
+ folly::constexpr_log2(CACHE_LINE_SIZE));
+}
+
+class LegacyBloomBitsReader : public FilterBitsReader {
+ public:
+ LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines,
+ uint32_t log2_cache_line_size)
+ : data_(data),
+ num_probes_(num_probes),
+ num_lines_(num_lines),
+ log2_cache_line_size_(log2_cache_line_size) {}
+
+ // No Copy allowed
+ LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete;
+ void operator=(const LegacyBloomBitsReader&) = delete;
+
+ ~LegacyBloomBitsReader() override {}
+
+ // "contents" contains the data built by a preceding call to
+ // FilterBitsBuilder::Finish. MayMatch must return true if the key was
+ // passed to FilterBitsBuilder::AddKey. This method may return true or false
+ // if the key was not on the list, but it should aim to return false with a
+ // high probability.
+ bool MayMatch(const Slice& key) override {
+ uint32_t hash = BloomHash(key);
+ uint32_t byte_offset;
+ LegacyBloomImpl::PrepareHashMayMatch(
+ hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
+ return LegacyBloomImpl::HashMayMatchPrepared(
+ hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
+ }
+
+ virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+ std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+ std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+ for (int i = 0; i < num_keys; ++i) {
+ hashes[i] = BloomHash(*keys[i]);
+ LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
+ /*out*/ &byte_offsets[i],
+ log2_cache_line_size_);
+ }
+ for (int i = 0; i < num_keys; ++i) {
+ may_match[i] = LegacyBloomImpl::HashMayMatchPrepared(
+ hashes[i], num_probes_, data_ + byte_offsets[i],
+ log2_cache_line_size_);
+ }
+ }
+
+ private:
+ const char* data_;
+ const int num_probes_;
+ const uint32_t num_lines_;
+ const uint32_t log2_cache_line_size_;
+};
+
+class AlwaysTrueFilter : public FilterBitsReader {
+ public:
+ bool MayMatch(const Slice&) override { return true; }
+ using FilterBitsReader::MayMatch; // inherit overload
+};
+
+class AlwaysFalseFilter : public FilterBitsReader {
+ public:
+ bool MayMatch(const Slice&) override { return false; }
+ using FilterBitsReader::MayMatch; // inherit overload
+};
+
+} // namespace
+
+const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllFixedImpls = {
+ kLegacyBloom,
+ kDeprecatedBlock,
+ kFastLocalBloom,
+};
+
+const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = {
+ kDeprecatedBlock,
+ kAuto,
+};
+
+BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode)
+ : mode_(mode), warned_(false) {
+ // Sanitize bits_per_key
+ if (bits_per_key < 1.0) {
+ bits_per_key = 1.0;
+ } else if (!(bits_per_key < 100.0)) { // including NaN
+ bits_per_key = 100.0;
+ }
+
+ // Includes a nudge toward rounding up, to ensure on all platforms
+ // that doubles specified with three decimal digits after the decimal
+ // point are interpreted accurately.
+ millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001);
+
+ // For better or worse, this is a rounding up of a nudged rounding up,
+ // e.g. 7.4999999999999 will round up to 8, but that provides more
+ // predictability against small arithmetic errors in floating point.
+ whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000;
+}
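+
+// Worked example of the rounding above: bits_per_key = 5.5 gives
+// millibits_per_key_ = static_cast<int>(5500.500001) = 5500 and
+// whole_bits_per_key_ = (5500 + 500) / 1000 = 6, so a legacy (whole-bit)
+// filter built from this policy uses 6 bits per key.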
+
+BloomFilterPolicy::~BloomFilterPolicy() {}
+
+const char* BloomFilterPolicy::Name() const {
+ return "rocksdb.BuiltinBloomFilter";
+}
+
+void BloomFilterPolicy::CreateFilter(const Slice* keys, int n,
+ std::string* dst) const {
+ // We should ideally only be using this deprecated interface for
+ // appropriately constructed BloomFilterPolicy
+ assert(mode_ == kDeprecatedBlock);
+
+ // Compute bloom filter size (in both bits and bytes)
+ uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_);
+
+ // For small n, we can see a very high false positive rate. Fix it
+ // by enforcing a minimum bloom filter length.
+ if (bits < 64) bits = 64;
+
+ uint32_t bytes = (bits + 7) / 8;
+ bits = bytes * 8;
+
+ int num_probes =
+ LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_);
+
+ const size_t init_size = dst->size();
+ dst->resize(init_size + bytes, 0);
+ dst->push_back(static_cast<char>(num_probes)); // Remember # of probes
+ char* array = &(*dst)[init_size];
+ for (int i = 0; i < n; i++) {
+ LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes,
+ array);
+ }
+}
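+
+// Worked example of the deprecated format above: with n = 4 keys and
+// whole_bits_per_key_ = 10, bits = 40 is raised to the 64-bit minimum,
+// bytes = 8, and dst grows by 9 bytes: 8 bytes of bloom bits followed by
+// one byte recording num_probes.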
+
+bool BloomFilterPolicy::KeyMayMatch(const Slice& key,
+ const Slice& bloom_filter) const {
+ const size_t len = bloom_filter.size();
+ if (len < 2 || len > 0xffffffffU) {
+ return false;
+ }
+
+ const char* array = bloom_filter.data();
+ const uint32_t bits = static_cast<uint32_t>(len - 1) * 8;
+
+  // Use the encoded k so that we can read filters generated by
+  // bloom filter builders created with different parameters.
+ const int k = static_cast<uint8_t>(array[len - 1]);
+ if (k > 30) {
+ // Reserved for potentially new encodings for short bloom filters.
+ // Consider it a match.
+ return true;
+ }
+ // NB: using stored k not num_probes for whole_bits_per_key_
+ return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k,
+ array);
+}
+
+FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const {
+ // This code path should no longer be used, for the built-in
+ // BloomFilterPolicy. Internal to RocksDB and outside
+ // BloomFilterPolicy, only get a FilterBitsBuilder with
+ // BloomFilterPolicy::GetBuilderFromContext(), which will call
+ // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have
+ // been warned (HISTORY.md) that they can no longer call this on
+ // the built-in BloomFilterPolicy (unlikely).
+ assert(false);
+ return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions()));
+}
+
+FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
+ const FilterBuildingContext& context) const {
+ Mode cur = mode_;
+ // Unusual code construction so that we can have just
+ // one exhaustive switch without (risky) recursion
+ for (int i = 0; i < 2; ++i) {
+ switch (cur) {
+ case kAuto:
+ if (context.table_options.format_version < 5) {
+ cur = kLegacyBloom;
+ } else {
+ cur = kFastLocalBloom;
+ }
+ break;
+ case kDeprecatedBlock:
+ return nullptr;
+ case kFastLocalBloom:
+ return new FastLocalBloomBitsBuilder(millibits_per_key_);
+ case kLegacyBloom:
+ if (whole_bits_per_key_ >= 14 && context.info_log &&
+ !warned_.load(std::memory_order_relaxed)) {
+ warned_ = true;
+ const char* adjective;
+ if (whole_bits_per_key_ >= 20) {
+ adjective = "Dramatic";
+ } else {
+ adjective = "Significant";
+ }
+ // For more details, see
+ // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+ ROCKS_LOG_WARN(
+ context.info_log,
+ "Using legacy Bloom filter with high (%d) bits/key. "
+ "%s filter space and/or accuracy improvement is available "
+ "with format_version>=5.",
+ whole_bits_per_key_, adjective);
+ }
+ return new LegacyBloomBitsBuilder(whole_bits_per_key_,
+ context.info_log);
+ }
+ }
+ assert(false);
+ return nullptr; // something legal
+}
+
+FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext(
+ const FilterBuildingContext& context) {
+ if (context.table_options.filter_policy) {
+ return context.table_options.filter_policy->GetBuilderWithContext(context);
+ } else {
+ return nullptr;
+ }
+}
+
+// Read metadata to determine what kind of FilterBitsReader is needed
+// and return a new one.
+FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader(
+ const Slice& contents) const {
+ uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+ if (len_with_meta <= 5) {
+ // filter is empty or broken. Treat like zero keys added.
+ return new AlwaysFalseFilter();
+ }
+
+ // Legacy Bloom filter data:
+ // 0 +-----------------------------------+
+ // | Raw Bloom filter data |
+ // | ... |
+ // len +-----------------------------------+
+ // | byte for num_probes or |
+ // | marker for new implementations |
+ // len+1 +-----------------------------------+
+ // | four bytes for number of cache |
+ // | lines |
+ // len_with_meta +-----------------------------------+
+
+ int8_t raw_num_probes =
+ static_cast<int8_t>(contents.data()[len_with_meta - 5]);
+  // NB: num_probes > 30 and < 128 probably have not been used, because of
+  // BloomFilterPolicy::initialize, unless LegacyBloomBitsBuilder is called
+  // directly as an API, but we are leaving those cases in limbo with
+  // LegacyBloomBitsReader for now.
+
+ if (raw_num_probes < 1) {
+ // Note: < 0 (or unsigned > 127) indicate special new implementations
+ // (or reserved for future use)
+ if (raw_num_probes == -1) {
+ // Marker for newer Bloom implementations
+ return GetBloomBitsReader(contents);
+ }
+ // otherwise
+ // Treat as zero probes (always FP) for now.
+ return new AlwaysTrueFilter();
+ }
+ // else attempt decode for LegacyBloomBitsReader
+
+ int num_probes = raw_num_probes;
+ assert(num_probes >= 1);
+ assert(num_probes <= 127);
+
+ uint32_t len = len_with_meta - 5;
+ assert(len > 0);
+
+ uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
+ uint32_t log2_cache_line_size;
+
+ if (num_lines * CACHE_LINE_SIZE == len) {
+ // Common case
+ log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE);
+ } else if (num_lines == 0 || len % num_lines != 0) {
+ // Invalid (no solution to num_lines * x == len)
+ // Treat as zero probes (always FP) for now.
+ return new AlwaysTrueFilter();
+ } else {
+ // Determine the non-native cache line size (from another system)
+ log2_cache_line_size = 0;
+ while ((num_lines << log2_cache_line_size) < len) {
+ ++log2_cache_line_size;
+ }
+ if ((num_lines << log2_cache_line_size) != len) {
+ // Invalid (block size not a power of two)
+ // Treat as zero probes (always FP) for now.
+ return new AlwaysTrueFilter();
+ }
+ }
+ // if not early return
+ return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines,
+ log2_cache_line_size);
+}
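+
+// Worked example of the cache line deduction above: a filter built on a
+// system with 128-byte cache lines has len = num_lines * 128. Read on a
+// 64-byte-cache-line system, num_lines * CACHE_LINE_SIZE != len, so the
+// loop shifts until (num_lines << log2_cache_line_size) == len, recovering
+// log2_cache_line_size = 7.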
+
+// For newer Bloom filter implementations
+FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader(
+ const Slice& contents) const {
+ uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+ uint32_t len = len_with_meta - 5;
+
+ assert(len > 0); // precondition
+
+ // New Bloom filter data:
+ // 0 +-----------------------------------+
+ // | Raw Bloom filter data |
+ // | ... |
+ // len +-----------------------------------+
+ // | char{-1} byte -> new Bloom filter |
+ // len+1 +-----------------------------------+
+ // | byte for subimplementation |
+ // | 0: FastLocalBloom |
+ // | other: reserved |
+ // len+2 +-----------------------------------+
+ // | byte for block_and_probes |
+ // | 0 in top 3 bits -> 6 -> 64-byte |
+ // | reserved: |
+ // | 1 in top 3 bits -> 7 -> 128-byte|
+ // | 2 in top 3 bits -> 8 -> 256-byte|
+ // | ... |
+ // | num_probes in bottom 5 bits, |
+ // | except 0 and 31 reserved |
+ // len+3 +-----------------------------------+
+ // | two bytes reserved |
+ // | possibly for hash seed |
+ // len_with_meta +-----------------------------------+
+
+ // Read more metadata (see above)
+ char sub_impl_val = contents.data()[len_with_meta - 4];
+ char block_and_probes = contents.data()[len_with_meta - 3];
+ int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6;
+
+ int num_probes = (block_and_probes & 31);
+ if (num_probes < 1 || num_probes > 30) {
+ // Reserved / future safe
+ return new AlwaysTrueFilter();
+ }
+
+ uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2);
+ if (rest != 0) {
+ // Reserved, possibly for hash seed
+ // Future safe
+ return new AlwaysTrueFilter();
+ }
+
+ if (sub_impl_val == 0) { // FastLocalBloom
+ if (log2_block_bytes == 6) { // Only block size supported for now
+ return new FastLocalBloomBitsReader(contents.data(), num_probes, len);
+ }
+ }
+ // otherwise
+ // Reserved / future safe
+ return new AlwaysTrueFilter();
+}
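+
+// Worked example of the metadata decode above: block_and_probes = 0x06 gives
+// log2_block_bytes = ((0x06 >> 5) & 7) + 6 = 6 (64-byte blocks) and
+// num_probes = 0x06 & 31 = 6, so with sub_impl_val == 0 this yields a
+// FastLocalBloomBitsReader with 6 probes.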
+
+const FilterPolicy* NewBloomFilterPolicy(double bits_per_key,
+ bool use_block_based_builder) {
+ BloomFilterPolicy::Mode m;
+ if (use_block_based_builder) {
+ m = BloomFilterPolicy::kDeprecatedBlock;
+ } else {
+ m = BloomFilterPolicy::kAuto;
+ }
+ assert(std::find(BloomFilterPolicy::kAllUserModes.begin(),
+ BloomFilterPolicy::kAllUserModes.end(),
+ m) != BloomFilterPolicy::kAllUserModes.end());
+ return new BloomFilterPolicy(bits_per_key, m);
+}
+
+FilterBuildingContext::FilterBuildingContext(
+ const BlockBasedTableOptions& _table_options)
+ : table_options(_table_options) {}
+
+FilterPolicy::~FilterPolicy() { }
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_policy_internal.h b/src/rocksdb/table/block_based/filter_policy_internal.h
new file mode 100644
index 000000000..2ca9dc859
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_policy_internal.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// Exposes any extra information needed for testing built-in
+// FilterBitsBuilders
+class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+ // Calculate number of bytes needed for a new filter, including
+ // metadata. Passing the result to CalculateNumEntry should
+ // return >= the num_entry passed in.
+ virtual uint32_t CalculateSpace(const int num_entry) = 0;
+
+ // Returns an estimate of the FP rate of the returned filter if
+ // `keys` keys are added and the filter returned by Finish is `bytes`
+ // bytes.
+ virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0;
+};
+
+// RocksDB built-in filter policy for Bloom or Bloom-like filters.
+// This class is considered internal API and subject to change.
+// See NewBloomFilterPolicy.
+class BloomFilterPolicy : public FilterPolicy {
+ public:
+ // An internal marker for operating modes of BloomFilterPolicy, in terms
+ // of selecting an implementation. This makes it easier for tests to track
+ // or to walk over the built-in set of Bloom filter implementations. The
+ // only variance in BloomFilterPolicy by mode/implementation is in
+ // GetFilterBitsBuilder(), so an enum is practical here vs. subclasses.
+ //
+ // This enum is essentially the union of all the different kinds of return
+ // value from GetFilterBitsBuilder, or "underlying implementation", and
+ // higher-level modes that choose an underlying implementation based on
+ // context information.
+ enum Mode {
+ // Legacy implementation of Bloom filter for full and partitioned filters.
+ // Set to 0 in case of value confusion with bool use_block_based_builder
+ // NOTE: TESTING ONLY as this mode does not use best compatible
+ // implementation
+ kLegacyBloom = 0,
+ // Deprecated block-based Bloom filter implementation.
+ // Set to 1 in case of value confusion with bool use_block_based_builder
+ // NOTE: DEPRECATED but user exposed
+ kDeprecatedBlock = 1,
+ // A fast, cache-local Bloom filter implementation. See description in
+ // FastLocalBloomImpl.
+ // NOTE: TESTING ONLY as this mode does not check format_version
+ kFastLocalBloom = 2,
+ // Automatically choose from the above (except kDeprecatedBlock) based on
+ // context at build time, including compatibility with format_version.
+ // NOTE: This is currently the only recommended mode that is user exposed.
+ kAuto = 100,
+ };
+ // All the different underlying implementations that a BloomFilterPolicy
+ // might use, as a mode that says "always use this implementation."
+ // Only appropriate for unit tests.
+ static const std::vector<Mode> kAllFixedImpls;
+
+ // All the different modes of BloomFilterPolicy that are exposed from
+ // user APIs. Only appropriate for higher-level unit tests. Integration
+ // tests should prefer using NewBloomFilterPolicy (user-exposed).
+ static const std::vector<Mode> kAllUserModes;
+
+ explicit BloomFilterPolicy(double bits_per_key, Mode mode);
+
+ ~BloomFilterPolicy() override;
+
+ const char* Name() const override;
+
+ // Deprecated block-based filter only
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override;
+
+ // Deprecated block-based filter only
+ bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override;
+
+ FilterBitsBuilder* GetFilterBitsBuilder() const override;
+
+ // To use this function, call GetBuilderFromContext().
+ //
+ // Neither the context nor any objects therein should be saved beyond
+ // the call to this function, unless it's shared_ptr.
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext&) const override;
+
+ // Returns a new FilterBitsBuilder from the filter_policy in
+ // table_options of a context, or nullptr if not applicable.
+ // (An internal convenience function to save boilerplate.)
+ static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&);
+
+ // Read metadata to determine what kind of FilterBitsReader is needed
+ // and return a new one. This must successfully process any filter data
+ // generated by a built-in FilterBitsBuilder, regardless of the impl
+ // chosen for this BloomFilterPolicy. Not compatible with CreateFilter.
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
+
+ // Essentially for testing only: configured millibits/key
+ int GetMillibitsPerKey() const { return millibits_per_key_; }
+ // Essentially for testing only: legacy whole bits/key
+ int GetWholeBitsPerKey() const { return whole_bits_per_key_; }
+
+ private:
+ // Newer filters support fractional bits per key. For predictable behavior
+ // of 0.001-precision values across floating point implementations, we
+ // round to thousandths of a bit (on average) per key.
+ int millibits_per_key_;
+
+ // Older filters round to whole number bits per key. (There *should* be no
+ // compatibility issue with fractional bits per key, but preserving old
+ // behavior with format_version < 5 just in case.)
+ int whole_bits_per_key_;
+
+ // Selected mode (a specific implementation or way of selecting an
+ // implementation) for building new SST filters.
+ Mode mode_;
+
+ // Whether relevant warnings have been logged already. (Remember so we
+ // only report once per BloomFilterPolicy instance, to keep the noise down.)
+ mutable std::atomic<bool> warned_;
+
+ // For newer Bloom filter implementation(s)
+ FilterBitsReader* GetBloomBitsReader(const Slice& contents) const;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.cc b/src/rocksdb/table/block_based/flush_block_policy.cc
new file mode 100644
index 000000000..f5cb2d227
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+#include <cassert>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+ // @params block_size: Approximate size of user data packed per
+ // block.
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured block_size.
+ FlushBlockBySizePolicy(const uint64_t block_size,
+ const uint64_t block_size_deviation,
+ const bool align,
+ const BlockBuilder& data_block_builder)
+ : block_size_(block_size),
+ block_size_deviation_limit_(
+ ((block_size * (100 - block_size_deviation)) + 99) / 100),
+ align_(align),
+ data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& key, const Slice& value) override {
+ // it makes no sense to flush when the data block is empty
+ if (data_block_builder_.empty()) {
+ return false;
+ }
+
+ auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+ // Do flush if one of the below two conditions is true:
+ // 1) if the current estimated size already exceeds the block size,
+ // 2) block_size_deviation is set and the estimated size after appending
+    // the kv would exceed the block size and the current size is within
+    // the deviation limit.
+ return curr_size >= block_size_ || BlockAlmostFull(key, value);
+ }
+
+ private:
+ bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+ if (block_size_deviation_limit_ == 0) {
+ return false;
+ }
+
+ const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+ auto estimated_size_after =
+ data_block_builder_.EstimateSizeAfterKV(key, value);
+
+ if (align_) {
+ estimated_size_after += kBlockTrailerSize;
+ return estimated_size_after > block_size_;
+ }
+
+ return estimated_size_after > block_size_ &&
+ curr_size > block_size_deviation_limit_;
+ }
+
+ const uint64_t block_size_;
+ const uint64_t block_size_deviation_limit_;
+ const bool align_;
+ const BlockBuilder& data_block_builder_;
+};
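+
+// Worked example of the deviation limit: with block_size = 4096 and
+// block_size_deviation = 10, block_size_deviation_limit_ =
+// ((4096 * 90) + 99) / 100 = 3687, so a block is cut early only when the
+// next k/v would push it past 4096 bytes and it already holds more than
+// 3687 bytes.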
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const {
+ return new FlushBlockBySizePolicy(
+ table_options.block_size, table_options.block_size_deviation,
+ table_options.block_align, data_block_builder);
+}
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const uint64_t size, const int deviation,
+ const BlockBuilder& data_block_builder) {
+ return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.h b/src/rocksdb/table/block_based/flush_block_policy.h
new file mode 100644
index 000000000..68c60c168
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FlushBlockEveryKeyPolicy is currently used only in tests.
+
+class FlushBlockEveryKeyPolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (!start_) {
+ start_ = true;
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ bool start_ = false;
+};
+
+class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryKeyPolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryKeyPolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryKeyPolicy;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.cc b/src/rocksdb/table/block_based/full_filter_block.cc
new file mode 100644
index 000000000..e2f7f476f
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/full_filter_block.h"
+#include <array>
+
+#include "monitoring/perf_context_imp.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FullFilterBlockBuilder::FullFilterBlockBuilder(
+ const SliceTransform* _prefix_extractor, bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder)
+ : prefix_extractor_(_prefix_extractor),
+ whole_key_filtering_(whole_key_filtering),
+ last_whole_key_recorded_(false),
+ last_prefix_recorded_(false),
+ num_added_(0) {
+ assert(filter_bits_builder != nullptr);
+ filter_bits_builder_.reset(filter_bits_builder);
+}
+
+void FullFilterBlockBuilder::Add(const Slice& key) {
+ const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key);
+ if (whole_key_filtering_) {
+ if (!add_prefix) {
+ AddKey(key);
+ } else {
+      // If both the whole key and the prefix are added to the bloom filter,
+      // whole-key and prefix additions are interleaved, so we cannot rely on
+      // the bits builder to detect duplicates by comparing with the last
+      // added item.
+ Slice last_whole_key = Slice(last_whole_key_str_);
+ if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) {
+ AddKey(key);
+ last_whole_key_recorded_ = true;
+ last_whole_key_str_.assign(key.data(), key.size());
+ }
+ }
+ }
+ if (add_prefix) {
+ AddPrefix(key);
+ }
+}
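+
+// Illustrative sequence for the dedup logic above (assuming a fixed-length-3
+// prefix extractor): Add("app1") adds the whole key "app1" and the prefix
+// "app"; Add("app2") adds "app2" but skips the prefix because "app" equals
+// last_prefix_str_. The interleaving is why the last-seen prefix and whole
+// key are tracked separately here rather than relying on the bits builder.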
+
+// Add key to filter if needed
+inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
+ filter_bits_builder_->AddKey(key);
+ num_added_++;
+}
+
+// Add prefix to filter if needed
+void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
+ Slice prefix = prefix_extractor_->Transform(key);
+ if (whole_key_filtering_) {
+    // If both the whole key and the prefix are added to the bloom filter,
+    // whole-key and prefix additions are interleaved, so we cannot rely on
+    // the bits builder to detect duplicates by comparing with the last
+    // added item.
+ Slice last_prefix = Slice(last_prefix_str_);
+ if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) {
+ AddKey(prefix);
+ last_prefix_recorded_ = true;
+ last_prefix_str_.assign(prefix.data(), prefix.size());
+ }
+ } else {
+ AddKey(prefix);
+ }
+}
+
+void FullFilterBlockBuilder::Reset() {
+ last_whole_key_recorded_ = false;
+ last_prefix_recorded_ = false;
+}
+
+Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
+ Status* status) {
+ Reset();
+ // In this impl we ignore BlockHandle
+ *status = Status::OK();
+ if (num_added_ != 0) {
+ num_added_ = 0;
+ return filter_bits_builder_->Finish(&filter_data_);
+ }
+ return Slice();
+}
+
+FullFilterBlockReader::FullFilterBlockReader(
+ const BlockBasedTable* t,
+ CachableEntry<ParsedFullFilterBlock>&& filter_block)
+ : FilterBlockReaderCommon(t, std::move(filter_block)) {
+ const SliceTransform* const prefix_extractor = table_prefix_extractor();
+ if (prefix_extractor) {
+ full_length_enabled_ =
+ prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
+ }
+}
+
+bool FullFilterBlockReader::KeyMayMatch(
+ const Slice& key, const SliceTransform* /*prefix_extractor*/,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
+#ifdef NDEBUG
+ (void)block_offset;
+#endif
+ assert(block_offset == kNotValid);
+ if (!whole_key_filtering()) {
+ return true;
+ }
+ return MayMatch(key, no_io, get_context, lookup_context);
+}
+
+std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+
+ CachableEntry<ParsedFullFilterBlock> filter_block;
+ if (prefetch || !use_cache) {
+ const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(),
+ use_cache, nullptr /* get_context */,
+ lookup_context, &filter_block);
+ if (!s.ok()) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ if (use_cache && !pin) {
+ filter_block.Reset();
+ }
+ }
+
+ return std::unique_ptr<FilterBlockReader>(
+ new FullFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool FullFilterBlockReader::PrefixMayMatch(
+ const Slice& prefix, const SliceTransform* /* prefix_extractor */,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) {
+#ifdef NDEBUG
+ (void)block_offset;
+#endif
+ assert(block_offset == kNotValid);
+ return MayMatch(prefix, no_io, get_context, lookup_context);
+}
+
+bool FullFilterBlockReader::MayMatch(
+ const Slice& entry, bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const {
+ CachableEntry<ParsedFullFilterBlock> filter_block;
+
+ const Status s =
+ GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
+ if (!s.ok()) {
+ return true;
+ }
+
+ assert(filter_block.GetValue());
+
+ FilterBitsReader* const filter_bits_reader =
+ filter_block.GetValue()->filter_bits_reader();
+
+ if (filter_bits_reader) {
+ if (filter_bits_reader->MayMatch(entry)) {
+ PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ return true;
+ } else {
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ return false;
+ }
+ }
+  return true;  // stay consistent with the block-based filter behavior
+}
+
+void FullFilterBlockReader::KeysMayMatch(
+ MultiGetRange* range, const SliceTransform* /*prefix_extractor*/,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) {
+#ifdef NDEBUG
+ (void)range;
+ (void)block_offset;
+#endif
+ assert(block_offset == kNotValid);
+ if (!whole_key_filtering()) {
+ // Simply return. Don't skip any key - consider all keys as likely to be
+ // present
+ return;
+ }
+ MayMatch(range, no_io, nullptr, lookup_context);
+}
+
+void FullFilterBlockReader::PrefixesMayMatch(
+ MultiGetRange* range, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) {
+#ifdef NDEBUG
+ (void)range;
+ (void)block_offset;
+#endif
+ assert(block_offset == kNotValid);
+ MayMatch(range, no_io, prefix_extractor, lookup_context);
+}
+
+void FullFilterBlockReader::MayMatch(
+ MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const {
+ CachableEntry<ParsedFullFilterBlock> filter_block;
+
+ const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context,
+ lookup_context, &filter_block);
+ if (!s.ok()) {
+ return;
+ }
+
+ assert(filter_block.GetValue());
+
+ FilterBitsReader* const filter_bits_reader =
+ filter_block.GetValue()->filter_bits_reader();
+
+ if (!filter_bits_reader) {
+ return;
+ }
+
+ // We need to use an array instead of autovector for may_match since
+ // &may_match[0] doesn't work for autovector<bool> (compiler error). So
+ // declare both keys and may_match as arrays, which is also slightly less
+ // expensive compared to autovector
+ std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
+ std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
+ autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+ int num_keys = 0;
+ MultiGetRange filter_range(*range, range->begin(), range->end());
+ for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+ if (!prefix_extractor) {
+ keys[num_keys++] = &iter->ukey;
+ } else if (prefix_extractor->InDomain(iter->ukey)) {
+ prefixes.emplace_back(prefix_extractor->Transform(iter->ukey));
+ keys[num_keys++] = &prefixes.back();
+ } else {
+ filter_range.SkipKey(iter);
+ }
+ }
+
+ filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]);
+
+ int i = 0;
+ for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+ if (!may_match[i]) {
+ // Update original MultiGet range to skip this key. The filter_range
+ // was temporarily used just to skip keys not in prefix_extractor domain
+ range->SkipKey(iter);
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ } else {
+ // PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ PerfContext* perf_ctx = get_perf_context();
+ perf_ctx->bloom_sst_hit_count++;
+ }
+ ++i;
+ }
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+}
+
+bool FullFilterBlockReader::RangeMayExist(
+ const Slice* iterate_upper_bound, const Slice& user_key,
+ const SliceTransform* prefix_extractor, const Comparator* comparator,
+ const Slice* const const_ikey_ptr, bool* filter_checked,
+ bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) {
+ if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) {
+ *filter_checked = false;
+ return true;
+ }
+ Slice prefix = prefix_extractor->Transform(user_key);
+ if (need_upper_bound_check &&
+ !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
+ *filter_checked = false;
+ return true;
+ } else {
+ *filter_checked = true;
+ return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
+ const_ikey_ptr, /* get_context */ nullptr,
+ lookup_context);
+ }
+}
+
+bool FullFilterBlockReader::IsFilterCompatible(
+ const Slice* iterate_upper_bound, const Slice& prefix,
+ const Comparator* comparator) const {
+  // Try to reuse the bloom filter in the SST table if prefix_extractor in
+  // mutable_cf_options has changed. If all keys in the range [user_key,
+  // upper_bound) share the same prefix, then we may still be able to use
+  // the bloom filter.
+ const SliceTransform* const prefix_extractor = table_prefix_extractor();
+ if (iterate_upper_bound != nullptr && prefix_extractor) {
+ if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
+ return false;
+ }
+ Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
+    // first, check whether user_key and upper_bound share the same prefix
+    if (!comparator->Equal(prefix, upper_bound_xform)) {
+      // second, check whether user_key's prefix is the immediate predecessor
+      // of upper_bound and has the same length. If so, we know for sure that
+      // all keys in the range [user_key, upper_bound) share the same prefix.
+      // We also need to make sure upper_bound is full length to ensure
+      // correctness.
+ if (!full_length_enabled_ ||
+ iterate_upper_bound->size() != prefix_extractor_full_length_ ||
+ !comparator->IsSameLengthImmediateSuccessor(prefix,
+ *iterate_upper_bound)) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
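+
+// Worked example (a sketch, not part of the original code): with a
+// fixed-length prefix extractor of length 3 (full_length_enabled_ == true,
+// prefix_extractor_full_length_ == 3), a seek at user_key "abc1" with
+// iterate_upper_bound "abd" is compatible: "abd" is the same-length
+// immediate successor of the prefix "abc", so every key in ["abc1", "abd")
+// shares the prefix "abc" and the prefix bloom filter can be consulted
+// safely. With iterate_upper_bound "abz", the successor check fails and the
+// filter is skipped.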
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.h b/src/rocksdb/table/block_based/full_filter_block.h
new file mode 100644
index 000000000..c72a58021
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+// The format of the full filter block is:
+// +----------------------------------------------------------------+
+// | full filter for all keys in sst file |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At its end we store
+// num_probes: how many hash functions are used in the bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+ explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+ bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder);
+ // No copying allowed
+ FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
+ void operator=(const FullFilterBlockBuilder&) = delete;
+
+  // bits_builder is created in filter_policy; it should be passed in here
+  // directly and will be deleted here
+ ~FullFilterBlockBuilder() {}
+
+ virtual bool IsBlockBased() override { return false; }
+ virtual void StartBlock(uint64_t /*block_offset*/) override {}
+ virtual void Add(const Slice& key) override;
+ virtual size_t NumAdded() const override { return num_added_; }
+ virtual Slice Finish(const BlockHandle& tmp, Status* status) override;
+ using FilterBlockBuilder::Finish;
+
+ protected:
+ virtual void AddKey(const Slice& key);
+ std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+ virtual void Reset();
+ void AddPrefix(const Slice& key);
+ const SliceTransform* prefix_extractor() { return prefix_extractor_; }
+
+ private:
+  // Important: all of these might point to invalid addresses at the time
+  // this filter block is destructed. The destructor should NOT dereference
+  // them.
+ const SliceTransform* prefix_extractor_;
+ bool whole_key_filtering_;
+ bool last_whole_key_recorded_;
+ std::string last_whole_key_str_;
+ bool last_prefix_recorded_;
+ std::string last_prefix_str_;
+
+ uint32_t num_added_;
+ std::unique_ptr<const char[]> filter_data_;
+
+};
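+
+// Typical lifecycle of the builder (a minimal sketch; `bits_builder` stands
+// for a FilterBitsBuilder obtained from the configured FilterPolicy):
+//
+//   FullFilterBlockBuilder builder(prefix_extractor,
+//                                  /*whole_key_filtering=*/true,
+//                                  bits_builder);
+//   builder.Add(key1);  // adds the whole key and, if in domain, its prefix
+//   builder.Add(key2);
+//   Slice contents = builder.Finish();  // one filter block for the table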
+
+// A FullFilterBlockReader is used to parse the filter from an SST table.
+// KeyMayMatch and PrefixMayMatch trigger the filter check.
+class FullFilterBlockReader
+ : public FilterBlockReaderCommon<ParsedFullFilterBlock> {
+ public:
+ FullFilterBlockReader(const BlockBasedTable* t,
+ CachableEntry<ParsedFullFilterBlock>&& filter_block);
+
+ static std::unique_ptr<FilterBlockReader> Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context);
+
+ bool IsBlockBased() override { return false; }
+
+ bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+
+ bool PrefixMayMatch(const Slice& prefix,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+
+ void KeysMayMatch(MultiGetRange* range,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) override;
+
+ void PrefixesMayMatch(MultiGetRange* range,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ BlockCacheLookupContext* lookup_context) override;
+ size_t ApproximateMemoryUsage() const override;
+ bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
+ const SliceTransform* prefix_extractor,
+ const Comparator* comparator,
+ const Slice* const const_ikey_ptr, bool* filter_checked,
+ bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) override;
+
+ private:
+ bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+ void MayMatch(MultiGetRange* range, bool no_io,
+ const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const;
+ bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
+ const Comparator* comparator) const;
+
+ bool full_length_enabled_;
+ size_t prefix_extractor_full_length_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block_test.cc b/src/rocksdb/table/block_based/full_filter_block_test.cc
new file mode 100644
index 000000000..496b149ab
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block_test.cc
@@ -0,0 +1,333 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <set>
+
+#include "table/block_based/full_filter_block.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+ explicit TestFilterBitsBuilder() {}
+
+ // Add Key to filter
+ void AddKey(const Slice& key) override {
+ hash_entries_.push_back(Hash(key.data(), key.size(), 1));
+ }
+
+ // Generate the filter using the keys that are added
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4;
+ char* data = new char[len];
+ for (size_t i = 0; i < hash_entries_.size(); i++) {
+ EncodeFixed32(data + i * 4, hash_entries_[i]);
+ }
+ const char* const_data = data;
+ buf->reset(const_data);
+ return Slice(data, len);
+ }
+
+ private:
+ std::vector<uint32_t> hash_entries_;
+};
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class TestFilterBitsReader : public FilterBitsReader {
+ public:
+ explicit TestFilterBitsReader(const Slice& contents)
+ : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {}
+
+ // Silence compiler warning about overloaded virtual
+ using FilterBitsReader::MayMatch;
+ bool MayMatch(const Slice& entry) override {
+ uint32_t h = Hash(entry.data(), entry.size(), 1);
+ for (size_t i = 0; i + 4 <= len_; i += 4) {
+ if (h == DecodeFixed32(data_ + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private:
+ const char* data_;
+ uint32_t len_;
+};
+
+
+class TestHashFilter : public FilterPolicy {
+ public:
+ const char* Name() const override { return "TestHashFilter"; }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ for (int i = 0; i < n; i++) {
+ uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+ PutFixed32(dst, h);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ uint32_t h = Hash(key.data(), key.size(), 1);
+ for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+ if (h == DecodeFixed32(filter.data() + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ FilterBitsBuilder* GetFilterBitsBuilder() const override {
+ return new TestFilterBitsBuilder();
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ return new TestFilterBitsReader(contents);
+ }
+};
+
+class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ PluginFullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(new TestHashFilter) {}
+};
+
+TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+  // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ builder.Add("foo");
+ builder.Add("bar");
+ builder.Add("box");
+ builder.Add("box");
+ builder.Add("hello");
+ Slice slice = builder.Finish();
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class FullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ FullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {}
+};
+
+TEST_F(FullFilterBlockTest, EmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+  // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
+ std::unique_ptr<FilterBitsBuilder> b_;
+ std::set<std::string> uniq_;
+
+ public:
+ explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {}
+
+ ~CountUniqueFilterBitsBuilderWrapper() override {}
+
+ void AddKey(const Slice& key) override {
+ b_->AddKey(key);
+ uniq_.insert(key.ToString());
+ }
+
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ Slice rv = b_->Finish(buf);
+ uniq_.clear();
+ return rv;
+ }
+
+ int CalculateNumEntry(const uint32_t bytes) override {
+ return b_->CalculateNumEntry(bytes);
+ }
+
+ size_t CountUnique() { return uniq_.size(); }
+};
+
+TEST_F(FullFilterBlockTest, DuplicateEntries) {
+ { // empty prefixes
+ std::unique_ptr<const SliceTransform> prefix_extractor(
+ NewFixedPrefixTransform(0));
+ auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder());
+ const bool WHOLE_KEY = true;
+ FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY,
+ bits_builder);
+ ASSERT_EQ(0, builder.NumAdded());
+ ASSERT_EQ(0, bits_builder->CountUnique());
+ // adds key and empty prefix; both abstractions count them
+ builder.Add("key1");
+ ASSERT_EQ(2, builder.NumAdded());
+ ASSERT_EQ(2, bits_builder->CountUnique());
+ // Add different key (unique) and also empty prefix (not unique).
+ // From here in this test, it's immaterial whether the block builder
+ // can count unique keys.
+ builder.Add("key2");
+ ASSERT_EQ(3, bits_builder->CountUnique());
+ // Empty key -> nothing unique
+ builder.Add("");
+ ASSERT_EQ(3, bits_builder->CountUnique());
+ }
+
+ // mix of empty and non-empty
+ std::unique_ptr<const SliceTransform> prefix_extractor(
+ NewFixedPrefixTransform(7));
+ auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder());
+ const bool WHOLE_KEY = true;
+ FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY,
+ bits_builder);
+ ASSERT_EQ(0, builder.NumAdded());
+ builder.Add(""); // test with empty key too
+ builder.Add("prefix1key1");
+ builder.Add("prefix1key1");
+ builder.Add("prefix1key2");
+ builder.Add("prefix1key3");
+ builder.Add("prefix2key4");
+ // 1 empty, 2 non-empty prefixes, and 4 non-empty keys
+ ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique());
+}
+
+TEST_F(FullFilterBlockTest, SingleChunk) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ ASSERT_EQ(0, builder.NumAdded());
+ builder.Add("foo");
+ builder.Add("bar");
+ builder.Add("box");
+ builder.Add("box");
+ builder.Add("hello");
+ ASSERT_EQ(5, builder.NumAdded());
+ Slice slice = builder.Finish();
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/index_builder.cc b/src/rocksdb/table/block_based/index_builder.cc
new file mode 100644
index 000000000..277bec61d
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/index_builder.h"
+
+#include <assert.h>
+#include <cinttypes>
+
+#include <list>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+
+// Without an anonymous namespace here, we fail the -Wmissing-prototypes
+// warning.
+namespace ROCKSDB_NAMESPACE {
+// Create an index builder based on its type.
+IndexBuilder* IndexBuilder::CreateIndexBuilder(
+ BlockBasedTableOptions::IndexType index_type,
+ const InternalKeyComparator* comparator,
+ const InternalKeySliceTransform* int_key_slice_transform,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ IndexBuilder* result = nullptr;
+ switch (index_type) {
+ case BlockBasedTableOptions::kBinarySearch: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ false);
+ } break;
+ case BlockBasedTableOptions::kHashSearch: {
+ // Currently kHashSearch is incompatible with index_block_restart_interval
+ // > 1
+ assert(table_opt.index_block_restart_interval == 1);
+ result = new HashIndexBuilder(
+ comparator, int_key_slice_transform,
+ table_opt.index_block_restart_interval, table_opt.format_version,
+ use_value_delta_encoding, table_opt.index_shortening);
+ } break;
+ case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+ result = PartitionedIndexBuilder::CreateIndexBuilder(
+ comparator, use_value_delta_encoding, table_opt);
+ } break;
+ case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ true);
+ } break;
+ default: {
+ assert(!"Do not recognize the index type ");
+ } break;
+ }
+ return result;
+}
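+
+// The index type is normally selected through BlockBasedTableOptions before
+// table construction; a minimal sketch:
+//
+//   BlockBasedTableOptions opts;
+//   opts.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+//   opts.metadata_block_size = 4096;  // target size of each index partition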
+
+PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ return new PartitionedIndexBuilder(comparator, table_opt,
+ use_value_delta_encoding);
+}
+
+PartitionedIndexBuilder::PartitionedIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const BlockBasedTableOptions& table_opt,
+ const bool use_value_delta_encoding)
+ : IndexBuilder(comparator),
+ index_block_builder_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ index_block_builder_without_seq_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ sub_index_builder_(nullptr),
+ table_opt_(table_opt),
+      // We start with false. After each partition we revise the value based
+      // on what the sub_index_builder has decided. If the feature is disabled
+      // entirely, this will be set to true after switching the first
+      // sub_index_builder. Otherwise, it is set to true as soon as even one
+      // of the sub_index_builders could not safely exclude seq from the keys;
+      // it is then enforced on all sub_index_builders in ::Finish.
+ use_value_delta_encoding_(use_value_delta_encoding) {}
+
+PartitionedIndexBuilder::~PartitionedIndexBuilder() {
+ delete sub_index_builder_;
+}
+
+void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
+ assert(sub_index_builder_ == nullptr);
+ sub_index_builder_ = new ShortenedIndexBuilder(
+ comparator_, table_opt_.index_block_restart_interval,
+ table_opt_.format_version, use_value_delta_encoding_,
+ table_opt_.index_shortening, /* include_first_key */ false);
+ flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+ // Note: this is sub-optimal since sub_index_builder_ could later reset
+ // seperator_is_key_plus_seq_ but the probability of that is low.
+ sub_index_builder_->seperator_is_key_plus_seq_
+ ? sub_index_builder_->index_block_builder_
+ : sub_index_builder_->index_block_builder_without_seq_));
+ partition_cut_requested_ = false;
+}
+
+void PartitionedIndexBuilder::RequestPartitionCut() {
+ partition_cut_requested_ = true;
+}
+
+void PartitionedIndexBuilder::AddIndexEntry(
+ std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
+  // Note: to avoid two consecutive flushes in the same method call, we do
+  // not check the flush policy when adding the last key
+ if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ if (sub_index_builder_->seperator_is_key_plus_seq_) {
+ // then we need to apply it to all sub-index builders
+ seperator_is_key_plus_seq_ = true;
+ }
+ sub_index_last_key_ = std::string(*last_key_in_current_block);
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ sub_index_builder_ = nullptr;
+ cut_filter_block = true;
+ } else {
+ // apply flush policy only to non-empty sub_index_builder_
+ if (sub_index_builder_ != nullptr) {
+ std::string handle_encoding;
+ block_handle.EncodeTo(&handle_encoding);
+ bool do_flush =
+ partition_cut_requested_ ||
+ flush_policy_->Update(*last_key_in_current_block, handle_encoding);
+ if (do_flush) {
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ cut_filter_block = true;
+ sub_index_builder_ = nullptr;
+ }
+ }
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ sub_index_last_key_ = std::string(*last_key_in_current_block);
+ if (sub_index_builder_->seperator_is_key_plus_seq_) {
+ // then we need to apply it to all sub-index builders
+ seperator_is_key_plus_seq_ = true;
+ }
+ }
+}
+
+Status PartitionedIndexBuilder::Finish(
+ IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
+ if (partition_cnt_ == 0) {
+ partition_cnt_ = entries_.size();
+ }
+  // It must be set to null after the last key is added
+ assert(sub_index_builder_ == nullptr);
+ if (finishing_indexes == true) {
+ Entry& last_entry = entries_.front();
+ std::string handle_encoding;
+ last_partition_block_handle.EncodeTo(&handle_encoding);
+ std::string handle_delta_encoding;
+ PutVarsignedint64(
+ &handle_delta_encoding,
+ last_partition_block_handle.size() - last_encoded_handle_.size());
+ last_encoded_handle_ = last_partition_block_handle;
+ const Slice handle_delta_encoding_slice(handle_delta_encoding);
+ index_block_builder_.Add(last_entry.key, handle_encoding,
+ &handle_delta_encoding_slice);
+ if (!seperator_is_key_plus_seq_) {
+ index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
+ handle_encoding,
+ &handle_delta_encoding_slice);
+ }
+ entries_.pop_front();
+ }
+ // If there is no sub_index left, then return the 2nd level index.
+ if (UNLIKELY(entries_.empty())) {
+ if (seperator_is_key_plus_seq_) {
+ index_blocks->index_block_contents = index_block_builder_.Finish();
+ } else {
+ index_blocks->index_block_contents =
+ index_block_builder_without_seq_.Finish();
+ }
+ top_level_index_size_ = index_blocks->index_block_contents.size();
+ index_size_ += top_level_index_size_;
+ return Status::OK();
+ } else {
+    // Finish the next partition index in line and return Incomplete() to
+    // indicate we expect more calls to Finish
+ Entry& entry = entries_.front();
+ // Apply the policy to all sub-indexes
+ entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
+ auto s = entry.value->Finish(index_blocks);
+ index_size_ += index_blocks->index_block_contents.size();
+ finishing_indexes = true;
+ return s.ok() ? Status::Incomplete() : s;
+ }
+}
+
+size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/index_builder.h b/src/rocksdb/table/block_based/index_builder.h
new file mode 100644
index 000000000..bfffc5996
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.h
@@ -0,0 +1,443 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <assert.h>
+#include <cinttypes>
+
+#include <list>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/comparator.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The interface for building an index.
+// Instructions for adding a new concrete IndexBuilder:
+// 1. Create a subclass derived from IndexBuilder.
+// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
+// 3. Add a create function for the new subclass in CreateIndexBuilder.
+// Note: we could devise a more advanced design to simplify the process of
+// adding a new subclass, but that would, on the other hand, increase code
+// complexity and demand unwanted attention from readers. Given that we won't
+// add or change indexes frequently, it makes sense to embrace a
+// straightforward design that just works.
+class IndexBuilder {
+ public:
+ static IndexBuilder* CreateIndexBuilder(
+ BlockBasedTableOptions::IndexType index_type,
+ const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
+ const InternalKeySliceTransform* int_key_slice_transform,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt);
+
+ // Index builder will construct a set of blocks which contain:
+ // 1. One primary index block.
+ // 2. (Optional) a set of metablocks that contains the metadata of the
+ // primary index.
+ struct IndexBlocks {
+ Slice index_block_contents;
+ std::unordered_map<std::string, Slice> meta_blocks;
+ };
+ explicit IndexBuilder(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ virtual ~IndexBuilder() {}
+
+  // Add a new index entry to the index block.
+ // To allow further optimization, we provide `last_key_in_current_block` and
+ // `first_key_in_next_block`, based on which the specific implementation can
+ // determine the best index key to be used for the index block.
+ // Called before the OnKeyAdded() call for first_key_in_next_block.
+  // @last_key_in_current_block: this parameter may be overridden with the
+  // value "substitute key".
+ // @first_key_in_next_block: it will be nullptr if the entry being added is
+ // the last one in the table
+ //
+ // REQUIRES: Finish() has not yet been called.
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) = 0;
+
+ // This method will be called whenever a key is added. The subclasses may
+ // override OnKeyAdded() if they need to collect additional information.
+ virtual void OnKeyAdded(const Slice& /*key*/) {}
+
+  // Inform the index builder that all entries have been written. The block
+  // builder may therefore perform any operation required for block
+  // finalization.
+ //
+ // REQUIRES: Finish() has not yet been called.
+ inline Status Finish(IndexBlocks* index_blocks) {
+ // Throw away the changes to last_partition_block_handle. It has no effect
+ // on the first call to Finish anyway.
+ BlockHandle last_partition_block_handle;
+ return Finish(index_blocks, last_partition_block_handle);
+ }
+
+ // This override of Finish can be utilized to build the 2nd level index in
+ // PartitionIndexBuilder.
+ //
+  // index_blocks will be filled with the resulting index data. If the return
+  // value is Status::Incomplete() then the index is partitioned and the
+  // caller should keep calling Finish until Status::OK() is returned. In
+  // that case, last_partition_block_handle is the handle of the block
+  // written with the result of the previous call to Finish. This can be
+  // utilized to build the second-level index pointing to each block of
+  // partitioned indexes. The last call to Finish() that returns Status::OK()
+  // populates index_blocks with the 2nd level index content.
+ virtual Status Finish(IndexBlocks* index_blocks,
+ const BlockHandle& last_partition_block_handle) = 0;
+
+  // Get the size of the index block. Must be called after ::Finish.
+ virtual size_t IndexSize() const = 0;
+
+ virtual bool seperator_is_key_plus_seq() { return true; }
+
+ protected:
+ const InternalKeyComparator* comparator_;
+ // Set after ::Finish is called
+ size_t index_size_ = 0;
+};
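+
+// Sketch of the two-phase Finish protocol described above (WriteBlock is a
+// hypothetical helper that persists a block and returns its handle):
+//
+//   IndexBuilder::IndexBlocks blocks;
+//   Status s = builder->Finish(&blocks);  // first call needs no handle
+//   while (s.IsIncomplete()) {
+//     BlockHandle handle = WriteBlock(blocks.index_block_contents);
+//     s = builder->Finish(&blocks, handle);
+//   }
+//   // On Status::OK(), blocks holds the top-level (or only) index contents.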
+
+// This index builder builds a space-efficient index block.
+//
+// Optimizations:
+// 1. Set the block's `block_restart_interval` to 1, which avoids a linear
+//    search when doing an index lookup (can be disabled by setting
+//    index_block_restart_interval).
+// 2. Shorten the key length for the index block. Rather than using the last
+//    key in the data block verbatim as the index key, we find the shortest
+//    substitute key that serves the same function.
+class ShortenedIndexBuilder : public IndexBuilder {
+ public:
+ explicit ShortenedIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const int index_block_restart_interval, const uint32_t format_version,
+ const bool use_value_delta_encoding,
+ BlockBasedTableOptions::IndexShorteningMode shortening_mode,
+ bool include_first_key)
+ : IndexBuilder(comparator),
+ index_block_builder_(index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ index_block_builder_without_seq_(index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ use_value_delta_encoding_(use_value_delta_encoding),
+ include_first_key_(include_first_key),
+ shortening_mode_(shortening_mode) {
+ // Making the default true will disable the feature for old versions
+ seperator_is_key_plus_seq_ = (format_version <= 2);
+ }
+
+ virtual void OnKeyAdded(const Slice& key) override {
+ if (include_first_key_ && current_block_first_internal_key_.empty()) {
+ current_block_first_internal_key_.assign(key.data(), key.size());
+ }
+ }
+
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) override {
+ if (first_key_in_next_block != nullptr) {
+ if (shortening_mode_ !=
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
+ comparator_->FindShortestSeparator(last_key_in_current_block,
+ *first_key_in_next_block);
+ }
+ if (!seperator_is_key_plus_seq_ &&
+ comparator_->user_comparator()->Compare(
+ ExtractUserKey(*last_key_in_current_block),
+ ExtractUserKey(*first_key_in_next_block)) == 0) {
+ seperator_is_key_plus_seq_ = true;
+ }
+ } else {
+ if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
+ kShortenSeparatorsAndSuccessor) {
+ comparator_->FindShortSuccessor(last_key_in_current_block);
+ }
+ }
+ auto sep = Slice(*last_key_in_current_block);
+
+ assert(!include_first_key_ || !current_block_first_internal_key_.empty());
+ IndexValue entry(block_handle, current_block_first_internal_key_);
+ std::string encoded_entry;
+ std::string delta_encoded_entry;
+ entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
+ if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
+ entry.EncodeTo(&delta_encoded_entry, include_first_key_,
+ &last_encoded_handle_);
+ } else {
+ // If it's the first block, or delta encoding is disabled,
+ // BlockBuilder::Add() below won't use delta-encoded slice.
+ }
+ last_encoded_handle_ = block_handle;
+ const Slice delta_encoded_entry_slice(delta_encoded_entry);
+ index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
+ if (!seperator_is_key_plus_seq_) {
+ index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
+ &delta_encoded_entry_slice);
+ }
+
+ current_block_first_internal_key_.clear();
+ }
+
+ using IndexBuilder::Finish;
+ virtual Status Finish(
+ IndexBlocks* index_blocks,
+ const BlockHandle& /*last_partition_block_handle*/) override {
+ if (seperator_is_key_plus_seq_) {
+ index_blocks->index_block_contents = index_block_builder_.Finish();
+ } else {
+ index_blocks->index_block_contents =
+ index_block_builder_without_seq_.Finish();
+ }
+ index_size_ = index_blocks->index_block_contents.size();
+ return Status::OK();
+ }
+
+ virtual size_t IndexSize() const override { return index_size_; }
+
+ virtual bool seperator_is_key_plus_seq() override {
+ return seperator_is_key_plus_seq_;
+ }
+
+ friend class PartitionedIndexBuilder;
+
+ private:
+ BlockBuilder index_block_builder_;
+ BlockBuilder index_block_builder_without_seq_;
+ const bool use_value_delta_encoding_;
+ bool seperator_is_key_plus_seq_;
+ const bool include_first_key_;
+ BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
+ BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
+ std::string current_block_first_internal_key_;
+};
+
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for the hash index consists of two parts:
+// - a metablock that compactly contains a sequence of prefixes. All prefixes
+//   are stored consecutively without any metadata (such as prefix sizes),
+//   which is kept in the other metablock.
+// - a metablock that contains the metadata of the prefixes, including prefix
+//   size, restart index and the number of blocks it spans. The format looks
+//   like:
+//
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix 1
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix 2
+// +-----------------+---------------------------+---------------------+
+// |                                                                   |
+// |                               ....                                |
+// |                                                                   |
+// +-----------------+---------------------------+---------------------+
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | <= prefix n
+// +-----------------+---------------------------+---------------------+
+//
+// The reason for separating these two metablocks is to enable efficient
+// reuse of the first metablock during hash index construction, without
+// unnecessary data copies or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder {
+ public:
+ explicit HashIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const SliceTransform* hash_key_extractor,
+ int index_block_restart_interval, int format_version,
+ bool use_value_delta_encoding,
+ BlockBasedTableOptions::IndexShorteningMode shortening_mode)
+ : IndexBuilder(comparator),
+ primary_index_builder_(comparator, index_block_restart_interval,
+ format_version, use_value_delta_encoding,
+ shortening_mode, /* include_first_key */ false),
+ hash_key_extractor_(hash_key_extractor) {}
+
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) override {
+ ++current_restart_index_;
+ primary_index_builder_.AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ }
+
+ virtual void OnKeyAdded(const Slice& key) override {
+ auto key_prefix = hash_key_extractor_->Transform(key);
+ bool is_first_entry = pending_block_num_ == 0;
+
+ // Keys may share the prefix
+ if (is_first_entry || pending_entry_prefix_ != key_prefix) {
+ if (!is_first_entry) {
+ FlushPendingPrefix();
+ }
+
+      // Need a hard copy; otherwise the underlying data changes all the
+      // time. TODO(kailiu) ToString() is expensive. We may speed this up by
+      // avoiding the data copy.
+ pending_entry_prefix_ = key_prefix.ToString();
+ pending_block_num_ = 1;
+ pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
+ } else {
+ // entry number increments when keys share the prefix reside in
+ // different data blocks.
+ auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
+ assert(last_restart_index <= current_restart_index_);
+ if (last_restart_index != current_restart_index_) {
+ ++pending_block_num_;
+ }
+ }
+ }
+
+ virtual Status Finish(
+ IndexBlocks* index_blocks,
+ const BlockHandle& last_partition_block_handle) override {
+ if (pending_block_num_ != 0) {
+ FlushPendingPrefix();
+ }
+ primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
+ index_blocks->meta_blocks.insert(
+ {kHashIndexPrefixesBlock.c_str(), prefix_block_});
+ index_blocks->meta_blocks.insert(
+ {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+ return Status::OK();
+ }
+
+ virtual size_t IndexSize() const override {
+ return primary_index_builder_.IndexSize() + prefix_block_.size() +
+ prefix_meta_block_.size();
+ }
+
+ virtual bool seperator_is_key_plus_seq() override {
+ return primary_index_builder_.seperator_is_key_plus_seq();
+ }
+
+ private:
+ void FlushPendingPrefix() {
+ prefix_block_.append(pending_entry_prefix_.data(),
+ pending_entry_prefix_.size());
+ PutVarint32Varint32Varint32(
+ &prefix_meta_block_,
+ static_cast<uint32_t>(pending_entry_prefix_.size()),
+ pending_entry_index_, pending_block_num_);
+ }
+
+ ShortenedIndexBuilder primary_index_builder_;
+ const SliceTransform* hash_key_extractor_;
+
+ // stores a sequence of prefixes
+ std::string prefix_block_;
+ // stores the metadata of prefixes
+ std::string prefix_meta_block_;
+
+  // The following 3 variables keep the unflushed prefix and its metadata.
+  // The details of block_num and entry_index can be found in
+  // "block_hash_index.{h,cc}"
+ uint32_t pending_block_num_ = 0;
+ uint32_t pending_entry_index_ = 0;
+ std::string pending_entry_prefix_;
+
+ uint64_t current_restart_index_ = 0;
+};
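+
+// Worked example (sketch): with a fixed-length-3 prefix extractor, suppose
+// data block 0 holds keys "app1" and "app2", and data block 1 holds "app3"
+// and "bee1". OnKeyAdded() sees prefixes "app", "app", "app", "bee", while
+// AddIndexEntry() advances current_restart_index_ at each block boundary.
+// The flushed metadata entries would then be:
+//   "app": {length=3, restart index=0, num-blocks=2}
+//   "bee": {length=3, restart index=1, num-blocks=1}
+// and prefix_block_ holds the concatenation "appbee".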
+
+/**
+ * IndexBuilder for two-level indexing. Internally it creates a new index
+ * for each partition and finishes them in order as Finish is called on it
+ * repeatedly, until Status::OK() is returned.
+ *
+ * The format on the disk would be I I I I I I IP where I is block containing a
+ * partition of indexes built using ShortenedIndexBuilder and IP is a block
+ * containing a secondary index on the partitions, built using
+ * ShortenedIndexBuilder.
+ */
+class PartitionedIndexBuilder : public IndexBuilder {
+ public:
+ static PartitionedIndexBuilder* CreateIndexBuilder(
+ const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt);
+
+ explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
+ const BlockBasedTableOptions& table_opt,
+ const bool use_value_delta_encoding);
+
+ virtual ~PartitionedIndexBuilder();
+
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) override;
+
+ virtual Status Finish(
+ IndexBlocks* index_blocks,
+ const BlockHandle& last_partition_block_handle) override;
+
+ virtual size_t IndexSize() const override { return index_size_; }
+ size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
+ size_t NumPartitions() const;
+
+ inline bool ShouldCutFilterBlock() {
+ // Current policy is to align the partitions of index and filters
+ if (cut_filter_block) {
+ cut_filter_block = false;
+ return true;
+ }
+ return false;
+ }
+
+ std::string& GetPartitionKey() { return sub_index_last_key_; }
+
+  // Called when an external entity (such as the filter partition builder)
+  // requests cutting the next partition
+ void RequestPartitionCut();
+
+ virtual bool seperator_is_key_plus_seq() override {
+ return seperator_is_key_plus_seq_;
+ }
+
+ bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
+
+ private:
+ // Set after ::Finish is called
+ size_t top_level_index_size_ = 0;
+ // Set after ::Finish is called
+ size_t partition_cnt_ = 0;
+
+ void MakeNewSubIndexBuilder();
+
+ struct Entry {
+ std::string key;
+ std::unique_ptr<ShortenedIndexBuilder> value;
+ };
+ std::list<Entry> entries_; // list of partitioned indexes and their keys
+ BlockBuilder index_block_builder_; // top-level index builder
+ BlockBuilder index_block_builder_without_seq_; // same for user keys
+ // the active partition index builder
+ ShortenedIndexBuilder* sub_index_builder_;
+ // the last key in the active partition index builder
+ std::string sub_index_last_key_;
+ std::unique_ptr<FlushBlockPolicy> flush_policy_;
+ // true if Finish is called once but not complete yet.
+ bool finishing_indexes = false;
+ const BlockBasedTableOptions& table_opt_;
+ bool seperator_is_key_plus_seq_;
+ bool use_value_delta_encoding_;
+  // true if an external entity (such as the filter partition builder)
+  // requests cutting the next partition
+ bool partition_cut_requested_ = true;
+ // true if it should cut the next filter partition block
+ bool cut_filter_block = false;
+ BlockHandle last_encoded_handle_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/mock_block_based_table.h b/src/rocksdb/table/block_based/mock_block_based_table.h
new file mode 100644
index 000000000..54817bd67
--- /dev/null
+++ b/src/rocksdb/table/block_based/mock_block_based_table.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class MockBlockBasedTableTester {
+ static constexpr int kMockLevel = 0;
+
+ public:
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
+ BlockBasedTableOptions table_options_;
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+
+ MockBlockBasedTableTester(const FilterPolicy *filter_policy)
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator) {
+ table_options_.filter_policy.reset(filter_policy);
+
+ constexpr bool skip_filters = false;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep(
+ ioptions_, env_options_, table_options_, icomp_, skip_filters,
+ kMockLevel, immortal_table)));
+ }
+
+ FilterBitsBuilder* GetBuilder() const {
+ FilterBuildingContext context(table_options_);
+ context.column_family_name = "mock_cf";
+ context.compaction_style = ioptions_.compaction_style;
+ context.level_at_creation = kMockLevel;
+ context.info_log = ioptions_.info_log;
+ return BloomFilterPolicy::GetBuilderFromContext(context);
+ }
+};
+
+} // namespace mock
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.cc b/src/rocksdb/table/block_based/parsed_full_filter_block.cc
new file mode 100644
index 000000000..3e555387e
--- /dev/null
+++ b/src/rocksdb/table/block_based/parsed_full_filter_block.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/parsed_full_filter_block.h"
+#include "rocksdb/filter_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy,
+ BlockContents&& contents)
+ : block_contents_(std::move(contents)),
+ filter_bits_reader_(
+ !block_contents_.data.empty()
+ ? filter_policy->GetFilterBitsReader(block_contents_.data)
+ : nullptr) {}
+
+ParsedFullFilterBlock::~ParsedFullFilterBlock() = default;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.h b/src/rocksdb/table/block_based/parsed_full_filter_block.h
new file mode 100644
index 000000000..36c619921
--- /dev/null
+++ b/src/rocksdb/table/block_based/parsed_full_filter_block.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterBitsReader;
+class FilterPolicy;
+
+// The sharable/cachable part of the full filter.
+class ParsedFullFilterBlock {
+ public:
+ ParsedFullFilterBlock(const FilterPolicy* filter_policy,
+ BlockContents&& contents);
+ ~ParsedFullFilterBlock();
+
+ FilterBitsReader* filter_bits_reader() const {
+ return filter_bits_reader_.get();
+ }
+
+ // TODO: consider memory usage of the FilterBitsReader
+ size_t ApproximateMemoryUsage() const {
+ return block_contents_.ApproximateMemoryUsage();
+ }
+
+ bool own_bytes() const { return block_contents_.own_bytes(); }
+
+ private:
+ BlockContents block_contents_;
+ std::unique_ptr<FilterBitsReader> filter_bits_reader_;
+};
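+
+// Minimal usage sketch (the contents would normally come from reading the
+// filter block of an SST file; ReadFilterBlockContents is hypothetical):
+//
+//   BlockContents contents = ReadFilterBlockContents(...);
+//   ParsedFullFilterBlock parsed(filter_policy, std::move(contents));
+//   FilterBitsReader* reader = parsed.filter_bits_reader();
+//   // reader is nullptr when the stored filter is empty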
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.cc b/src/rocksdb/table/block_based/partitioned_filter_block.cc
new file mode 100644
index 000000000..2138d96dd
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block.cc
@@ -0,0 +1,388 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/partitioned_filter_block.h"
+
+#include <utility>
+
+#include "monitoring/perf_context_imp.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
+ const SliceTransform* _prefix_extractor, bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
+ const bool use_value_delta_encoding,
+ PartitionedIndexBuilder* const p_index_builder,
+ const uint32_t partition_size)
+ : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
+ filter_bits_builder),
+ index_on_filter_block_builder_(index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ index_on_filter_block_builder_without_seq_(index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ p_index_builder_(p_index_builder),
+ keys_added_to_partition_(0) {
+ keys_per_partition_ =
+ filter_bits_builder_->CalculateNumEntry(partition_size);
+}
+
+PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {}
+
+void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
+ const Slice* next_key) {
+ // Use == to send the request only once
+ if (keys_added_to_partition_ == keys_per_partition_) {
+ // Currently only index builder is in charge of cutting a partition. We keep
+ // requesting until it is granted.
+ p_index_builder_->RequestPartitionCut();
+ }
+ if (!p_index_builder_->ShouldCutFilterBlock()) {
+ return;
+ }
+ filter_gc.push_back(std::unique_ptr<const char[]>(nullptr));
+
+  // Add the prefix of the next key before finishing the partition. This hack
+  // fixes a bug with format_version=3 where seeking for the prefix would
+  // land us in the previous partition.
+ const bool add_prefix =
+ next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
+ if (add_prefix) {
+ FullFilterBlockBuilder::AddPrefix(*next_key);
+ }
+
+ Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
+ std::string& index_key = p_index_builder_->GetPartitionKey();
+ filters.push_back({index_key, filter});
+ keys_added_to_partition_ = 0;
+ Reset();
+}
+
+void PartitionedFilterBlockBuilder::Add(const Slice& key) {
+ MaybeCutAFilterBlock(&key);
+ FullFilterBlockBuilder::Add(key);
+}
+
+void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
+ FullFilterBlockBuilder::AddKey(key);
+ keys_added_to_partition_++;
+}
+
+Slice PartitionedFilterBlockBuilder::Finish(
+ const BlockHandle& last_partition_block_handle, Status* status) {
+ if (finishing_filters == true) {
+ // Record the handle of the last written filter block in the index
+ FilterEntry& last_entry = filters.front();
+ std::string handle_encoding;
+ last_partition_block_handle.EncodeTo(&handle_encoding);
+ std::string handle_delta_encoding;
+ PutVarsignedint64(
+ &handle_delta_encoding,
+ last_partition_block_handle.size() - last_encoded_handle_.size());
+ last_encoded_handle_ = last_partition_block_handle;
+ const Slice handle_delta_encoding_slice(handle_delta_encoding);
+ index_on_filter_block_builder_.Add(last_entry.key, handle_encoding,
+ &handle_delta_encoding_slice);
+ if (!p_index_builder_->seperator_is_key_plus_seq()) {
+ index_on_filter_block_builder_without_seq_.Add(
+ ExtractUserKey(last_entry.key), handle_encoding,
+ &handle_delta_encoding_slice);
+ }
+ filters.pop_front();
+ } else {
+ MaybeCutAFilterBlock(nullptr);
+ }
+ // If there is no filter partition left, then return the index on filter
+ // partitions
+ if (UNLIKELY(filters.empty())) {
+ *status = Status::OK();
+ if (finishing_filters) {
+ if (p_index_builder_->seperator_is_key_plus_seq()) {
+ return index_on_filter_block_builder_.Finish();
+ } else {
+ return index_on_filter_block_builder_without_seq_.Finish();
+ }
+ } else {
+ // This is the rare case where no key was added to the filter
+ return Slice();
+ }
+ } else {
+ // Return the next filter partition in line and set Incomplete() status to
+ // indicate we expect more calls to Finish
+ *status = Status::Incomplete();
+ finishing_filters = true;
+ return filters.front().filter;
+ }
+}
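+
+// Sketch of the Finish protocol (it mirrors PartitionedIndexBuilder::Finish;
+// WriteBlock is a hypothetical helper that persists a block and returns its
+// handle):
+//
+//   Status s;
+//   BlockHandle handle;  // ignored on the first call
+//   Slice block = builder.Finish(handle, &s);
+//   while (s.IsIncomplete()) {
+//     handle = WriteBlock(block);
+//     block = builder.Finish(handle, &s);
+//   }
+//   // The final block (when s.ok()) is the index on the filter partitions.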
+
+PartitionedFilterBlockReader::PartitionedFilterBlockReader(
+ const BlockBasedTable* t, CachableEntry<Block>&& filter_block)
+ : FilterBlockReaderCommon(t, std::move(filter_block)) {}
+
+std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+
+ CachableEntry<Block> filter_block;
+ if (prefetch || !use_cache) {
+ const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(),
+ use_cache, nullptr /* get_context */,
+ lookup_context, &filter_block);
+ if (!s.ok()) {
+ return std::unique_ptr<FilterBlockReader>();
+ }
+
+ if (use_cache && !pin) {
+ filter_block.Reset();
+ }
+ }
+
+ return std::unique_ptr<FilterBlockReader>(
+ new PartitionedFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool PartitionedFilterBlockReader::KeyMayMatch(
+ const Slice& key, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) {
+ assert(const_ikey_ptr != nullptr);
+ assert(block_offset == kNotValid);
+ if (!whole_key_filtering()) {
+ return true;
+ }
+
+ return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr,
+ get_context, lookup_context,
+ &FullFilterBlockReader::KeyMayMatch);
+}
+
+bool PartitionedFilterBlockReader::PrefixMayMatch(
+ const Slice& prefix, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context) {
+#ifdef NDEBUG
+ (void)block_offset;
+#endif
+ assert(const_ikey_ptr != nullptr);
+ assert(block_offset == kNotValid);
+ if (!table_prefix_extractor() && !prefix_extractor) {
+ return true;
+ }
+
+ return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr,
+ get_context, lookup_context,
+ &FullFilterBlockReader::PrefixMayMatch);
+}
+
+BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
+ const CachableEntry<Block>& filter_block, const Slice& entry) const {
+ IndexBlockIter iter;
+ const InternalKeyComparator* const comparator = internal_comparator();
+ Statistics* kNullStats = nullptr;
+ filter_block.GetValue()->NewIndexIterator(
+ comparator, comparator->user_comparator(), &iter, kNullStats,
+ true /* total_order_seek */, false /* have_first_key */,
+ index_key_includes_seq(), index_value_is_full());
+ iter.Seek(entry);
+ if (UNLIKELY(!iter.Valid())) {
+    // entry is larger than all the keys. However, its prefix might still be
+    // present in the last partition. If this is called by PrefixMayMatch,
+    // handling this case is necessary for correct behavior. Otherwise it is
+    // unnecessary but safe. Assuming this is an unlikely case for full key
+    // search, the performance overhead should be negligible.
+ iter.SeekToLast();
+ }
+ assert(iter.Valid());
+ BlockHandle fltr_blk_handle = iter.value().handle;
+ return fltr_blk_handle;
+}
+
+Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
+ FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle,
+ bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<ParsedFullFilterBlock>* filter_block) const {
+ assert(table());
+ assert(filter_block);
+ assert(filter_block->IsEmpty());
+
+ if (!filter_map_.empty()) {
+ auto iter = filter_map_.find(fltr_blk_handle.offset());
+    // The partition is not guaranteed to be present in the map, since the
+    // block cache might not have had space for it
+ if (iter != filter_map_.end()) {
+ filter_block->SetUnownedValue(iter->second.GetValue());
+ return Status::OK();
+ }
+ }
+
+ ReadOptions read_options;
+ if (no_io) {
+ read_options.read_tier = kBlockCacheTier;
+ }
+
+ const Status s =
+ table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle,
+ UncompressionDict::GetEmptyDict(), filter_block,
+ BlockType::kFilter, get_context, lookup_context,
+ /* for_compaction */ false, /* use_cache */ true);
+
+ return s;
+}
+
+bool PartitionedFilterBlockReader::MayMatch(
+ const Slice& slice, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr,
+ GetContext* get_context, BlockCacheLookupContext* lookup_context,
+ FilterFunction filter_function) const {
+ CachableEntry<Block> filter_block;
+ Status s =
+ GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block);
+ if (UNLIKELY(!s.ok())) {
+ return true;
+ }
+
+ if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
+ return true;
+ }
+
+ auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
+ if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range
+ return false;
+ }
+
+ CachableEntry<ParsedFullFilterBlock> filter_partition_block;
+ s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
+ no_io, get_context, lookup_context,
+ &filter_partition_block);
+ if (UNLIKELY(!s.ok())) {
+ return true;
+ }
+
+ FullFilterBlockReader filter_partition(table(),
+ std::move(filter_partition_block));
+ return (filter_partition.*filter_function)(
+ slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context,
+ lookup_context);
+}
+
+size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+ // TODO(myabandeh): better estimation for filter_map_ size
+}
+
+// TODO(myabandeh): merge this with the same function in IndexReader
+void PartitionedFilterBlockReader::CacheDependencies(bool pin) {
+ assert(table());
+
+ const BlockBasedTable::Rep* const rep = table()->get_rep();
+ assert(rep);
+
+ BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+
+ CachableEntry<Block> filter_block;
+
+ Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
+ &lookup_context, &filter_block);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(rep->ioptions.info_log,
+ "Error retrieving top-level filter block while trying to "
+ "cache filter partitions: %s",
+ s.ToString().c_str());
+ return;
+ }
+
+  // Before reading the partitions, prefetch them to avoid lots of IOs
+ assert(filter_block.GetValue());
+
+ IndexBlockIter biter;
+ const InternalKeyComparator* const comparator = internal_comparator();
+ Statistics* kNullStats = nullptr;
+ filter_block.GetValue()->NewIndexIterator(
+ comparator, comparator->user_comparator(), &biter, kNullStats,
+ true /* total_order_seek */, false /* have_first_key */,
+ index_key_includes_seq(), index_value_is_full());
+  // Filter partitions are assumed to be consecutive. Prefetch them all.
+  // Read the first block's offset
+ biter.SeekToFirst();
+ BlockHandle handle = biter.value().handle;
+ uint64_t prefetch_off = handle.offset();
+
+ // Read the last block's offset
+ biter.SeekToLast();
+ handle = biter.value().handle;
+ uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
+ uint64_t prefetch_len = last_off - prefetch_off;
+ std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+
+ prefetch_buffer.reset(new FilePrefetchBuffer());
+ s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
+ static_cast<size_t>(prefetch_len));
+
+ // After prefetch, read the partitions one by one
+ ReadOptions read_options;
+ for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
+ handle = biter.value().handle;
+
+ CachableEntry<ParsedFullFilterBlock> block;
+ // TODO: Support counter batch update for partitioned index and
+ // filter blocks
+ s = table()->MaybeReadBlockAndLoadToCache(
+ prefetch_buffer.get(), read_options, handle,
+ UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter,
+ nullptr /* get_context */, &lookup_context, nullptr /* contents */);
+
+ assert(s.ok() || block.GetValue() == nullptr);
+ if (s.ok() && block.GetValue() != nullptr) {
+ if (block.IsCached()) {
+ if (pin) {
+ filter_map_[handle.offset()] = std::move(block);
+ }
+ }
+ }
+ }
+}
+
+const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator()
+ const {
+ assert(table());
+ assert(table()->get_rep());
+
+ return &table()->get_rep()->internal_comparator;
+}
+
+bool PartitionedFilterBlockReader::index_key_includes_seq() const {
+ assert(table());
+ assert(table()->get_rep());
+
+ return table()->get_rep()->index_key_includes_seq;
+}
+
+bool PartitionedFilterBlockReader::index_value_is_full() const {
+ assert(table());
+ assert(table()->get_rep());
+
+ return table()->get_rep()->index_value_is_full;
+}
+
+} // namespace ROCKSDB_NAMESPACE
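A note on the Finish() path above: with value delta encoding enabled, each top-level index entry carries the full encoded handle of a filter partition plus a varint-signed delta of its size relative to the previously written partition. The sketch below shows the gist of that encoding, with a hand-rolled zigzag varint standing in for RocksDB's PutVarsignedint64; the helper name and sample sizes are illustrative, not the library's API.

    #include <cstdint>
    #include <string>

    // Zigzag-map the signed delta onto an unsigned value, then varint-encode
    // it; conceptually this is what a PutVarsignedint64-style helper does.
    void PutVarsigned64(std::string* dst, int64_t v) {
      uint64_t u =
          (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
      while (u >= 0x80) {
        dst->push_back(static_cast<char>(u | 0x80));
        u >>= 7;
      }
      dst->push_back(static_cast<char>(u));
    }

    int main() {
      // Sizes of two consecutive filter partitions (hypothetical values).
      const int64_t prev_size = 4096;
      const int64_t curr_size = 4072;
      std::string handle_delta_encoding;
      // Per entry, only the size delta is stored next to the full handle;
      // small deltas encode to one or two bytes.
      PutVarsigned64(&handle_delta_encoding, curr_size - prev_size);
      return handle_delta_encoding.size() == 1 ? 0 : 1;  // delta -24 -> 1 byte
    }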
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.h b/src/rocksdb/table/block_based/partitioned_filter_block.h
new file mode 100644
index 000000000..314297cab
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <string>
+#include <unordered_map>
+#include "db/dbformat.h"
+#include "index_builder.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/full_filter_block.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
+ public:
+ explicit PartitionedFilterBlockBuilder(
+ const SliceTransform* prefix_extractor, bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
+ const bool use_value_delta_encoding,
+ PartitionedIndexBuilder* const p_index_builder,
+ const uint32_t partition_size);
+
+ virtual ~PartitionedFilterBlockBuilder();
+
+ void AddKey(const Slice& key) override;
+ void Add(const Slice& key) override;
+
+ virtual Slice Finish(const BlockHandle& last_partition_block_handle,
+ Status* status) override;
+
+ private:
+ // Filter data
+ BlockBuilder index_on_filter_block_builder_; // top-level index builder
+ BlockBuilder
+ index_on_filter_block_builder_without_seq_; // same for user keys
+ struct FilterEntry {
+ std::string key;
+ Slice filter;
+ };
+  std::list<FilterEntry> filters;  // list of partitioned filters and their keys
+ std::unique_ptr<IndexBuilder> value;
+ std::vector<std::unique_ptr<const char[]>> filter_gc;
+  bool finishing_filters =
+      false;  // true if Finish is called once but not complete yet.
+  // The policy of when to cut a filter block and Finish it
+  void MaybeCutAFilterBlock(const Slice* next_key);
+  // Currently we keep the same number of partitions for filters and indexes.
+  // This would allow for some potential optimizations in the future. If such
+  // optimizations do not materialize, we can use a different number of
+  // partitions and eliminate p_index_builder_.
+ PartitionedIndexBuilder* const p_index_builder_;
+ // The desired number of keys per partition
+ uint32_t keys_per_partition_;
+ // The number of keys added to the last partition so far
+ uint32_t keys_added_to_partition_;
+ BlockHandle last_encoded_handle_;
+};
+
+class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
+ public:
+ PartitionedFilterBlockReader(const BlockBasedTable* t,
+ CachableEntry<Block>&& filter_block);
+
+ static std::unique_ptr<FilterBlockReader> Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context);
+
+ bool IsBlockBased() override { return false; }
+ bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+ bool PrefixMayMatch(const Slice& prefix,
+ const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) override;
+
+ size_t ApproximateMemoryUsage() const override;
+
+ private:
+ BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block,
+ const Slice& entry) const;
+ Status GetFilterPartitionBlock(
+ FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle,
+ bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<ParsedFullFilterBlock>* filter_block) const;
+
+ using FilterFunction = bool (FullFilterBlockReader::*)(
+ const Slice& slice, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, const bool no_io,
+ const Slice* const const_ikey_ptr, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context);
+ bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor,
+ uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ FilterFunction filter_function) const;
+ void CacheDependencies(bool pin) override;
+
+ const InternalKeyComparator* internal_comparator() const;
+ bool index_key_includes_seq() const;
+ bool index_value_is_full() const;
+
+ protected:
+ std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>>
+ filter_map_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
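To make the reader's control flow concrete: a probe is a two-level lookup, first a seek in the top-level index for the partition whose last key is >= the probe (falling back to the last partition when the seek runs off the end, as GetFilterPartitionHandle does), then a membership query against that partition. A simplified model, assuming std::map as a stand-in for the index block and an exact-match set in place of a Bloom filter:

    #include <map>
    #include <set>
    #include <string>

    struct Partition {
      std::set<std::string> keys;  // exact-match stand-in for a Bloom filter
      bool KeyMayMatch(const std::string& k) const { return keys.count(k) > 0; }
    };

    // Top-level index: last key of each partition -> that partition.
    bool MayMatch(const std::map<std::string, Partition>& index,
                  const std::string& key) {
      if (index.empty()) return true;   // no filter cannot rule a key out
      auto it = index.lower_bound(key);  // mimics IndexBlockIter::Seek
      if (it == index.end()) --it;       // same fallback as iter.SeekToLast()
      return it->second.KeyMayMatch(key);
    }

    int main() {
      std::map<std::string, Partition> index;
      index["box"] = Partition{{"bar", "box"}};
      index["hello"] = Partition{{"hello"}};
      const bool hit = MayMatch(index, "bar");       // true
      const bool miss = MayMatch(index, "missing");  // false, last partition
      return (hit && !miss) ? 0 : 1;
    }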
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block_test.cc b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc
new file mode 100644
index 000000000..071bad9ca
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc
@@ -0,0 +1,424 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <map>
+
+#include "rocksdb/filter_policy.h"
+
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+
+#include "index_builder.h"
+#include "logging/logging.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::map<uint64_t, std::string> blooms;
+
+class MockedBlockBasedTable : public BlockBasedTable {
+ public:
+ MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib)
+ : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) {
+ // Initialize what Open normally does as much as necessary for the test
+ rep->index_key_includes_seq = pib->seperator_is_key_plus_seq();
+ rep->index_value_is_full = !pib->get_use_value_delta_encoding();
+ }
+};
+
+class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader {
+ public:
+ MyPartitionedFilterBlockReader(BlockBasedTable* t,
+ CachableEntry<Block>&& filter_block)
+ : PartitionedFilterBlockReader(t, std::move(filter_block)) {
+ for (const auto& pair : blooms) {
+ const uint64_t offset = pair.first;
+ const std::string& bloom = pair.second;
+
+ assert(t);
+ assert(t->get_rep());
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(
+ t->get_rep()->table_options.filter_policy.get(),
+ BlockContents(Slice(bloom))),
+ nullptr /* cache */, nullptr /* cache_handle */,
+ true /* own_value */);
+ filter_map_[offset] = std::move(block);
+ }
+ }
+};
+
+class PartitionedFilterBlockTest
+ : public testing::Test,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ Options options_;
+ ImmutableCFOptions ioptions_;
+ EnvOptions env_options_;
+ BlockBasedTableOptions table_options_;
+ InternalKeyComparator icomp_;
+ std::unique_ptr<BlockBasedTable> table_;
+ std::shared_ptr<Cache> cache_;
+ int bits_per_key_;
+
+ PartitionedFilterBlockTest()
+ : ioptions_(options_),
+ env_options_(options_),
+ icomp_(options_.comparator),
+ bits_per_key_(10) {
+ table_options_.filter_policy.reset(
+ NewBloomFilterPolicy(bits_per_key_, false));
+ table_options_.format_version = GetParam();
+ table_options_.index_block_restart_interval = 3;
+ }
+
+ ~PartitionedFilterBlockTest() override {}
+
+ const std::string keys[4] = {"afoo", "bar", "box", "hello"};
+ const std::string missing_keys[2] = {"missing", "other"};
+
+ uint64_t MaxIndexSize() {
+ int num_keys = sizeof(keys) / sizeof(*keys);
+ uint64_t max_key_size = 0;
+ for (int i = 1; i < num_keys; i++) {
+      max_key_size =
+          std::max(max_key_size, static_cast<uint64_t>(keys[i].size()));
+ }
+ uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/);
+ return max_index_size;
+ }
+
+ uint64_t MaxFilterSize() {
+ int num_keys = sizeof(keys) / sizeof(*keys);
+ // General, rough over-approximation
+ return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5);
+ }
+
+ uint64_t last_offset = 10;
+ BlockHandle Write(const Slice& slice) {
+ BlockHandle bh(last_offset + 1, slice.size());
+ blooms[bh.offset()] = slice.ToString();
+ last_offset += bh.size();
+ return bh;
+ }
+
+ PartitionedIndexBuilder* NewIndexBuilder() {
+ const bool kValueDeltaEncoded = true;
+ return PartitionedIndexBuilder::CreateIndexBuilder(
+ &icomp_, !kValueDeltaEncoded, table_options_);
+ }
+
+ PartitionedFilterBlockBuilder* NewBuilder(
+ PartitionedIndexBuilder* const p_index_builder,
+ const SliceTransform* prefix_extractor = nullptr) {
+ assert(table_options_.block_size_deviation <= 100);
+ auto partition_size = static_cast<uint32_t>(
+ ((table_options_.metadata_block_size *
+ (100 - table_options_.block_size_deviation)) +
+ 99) /
+ 100);
+ partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+ const bool kValueDeltaEncoded = true;
+ return new PartitionedFilterBlockBuilder(
+ prefix_extractor, table_options_.whole_key_filtering,
+ BloomFilterPolicy::GetBuilderFromContext(
+ FilterBuildingContext(table_options_)),
+ table_options_.index_block_restart_interval, !kValueDeltaEncoded,
+ p_index_builder, partition_size);
+ }
+
+ PartitionedFilterBlockReader* NewReader(
+ PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) {
+ BlockHandle bh;
+ Status status;
+ Slice slice;
+ do {
+ slice = builder->Finish(bh, &status);
+ bh = Write(slice);
+ } while (status.IsIncomplete());
+
+ constexpr bool skip_filters = false;
+ constexpr int level = 0;
+ constexpr bool immortal_table = false;
+ table_.reset(new MockedBlockBasedTable(
+ new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
+ icomp_, skip_filters, level, immortal_table),
+ pib));
+ BlockContents contents(slice);
+ CachableEntry<Block> block(
+ new Block(std::move(contents), kDisableGlobalSequenceNumber,
+ 0 /* read_amp_bytes_per_bit */, nullptr),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+ auto reader =
+ new MyPartitionedFilterBlockReader(table_.get(), std::move(block));
+ return reader;
+ }
+
+ void VerifyReader(PartitionedFilterBlockBuilder* builder,
+ PartitionedIndexBuilder* pib, bool empty = false,
+ const SliceTransform* prefix_extractor = nullptr) {
+ std::unique_ptr<PartitionedFilterBlockReader> reader(
+ NewReader(builder, pib));
+ // Querying added keys
+ const bool no_io = true;
+ for (auto key : keys) {
+ auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io,
+ &ikey_slice, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ }
+ {
+ // querying a key twice
+ auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ ASSERT_TRUE(reader->KeyMayMatch(
+ keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+ }
+ // querying missing keys
+ for (auto key : missing_keys) {
+ auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ if (empty) {
+ ASSERT_TRUE(reader->KeyMayMatch(
+ key, prefix_extractor, kNotValid, !no_io, &ikey_slice,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+ } else {
+ // assuming a good hash function
+ ASSERT_FALSE(reader->KeyMayMatch(
+ key, prefix_extractor, kNotValid, !no_io, &ikey_slice,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+ }
+ }
+ }
+
+ int TestBlockPerKey() {
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+ NewBuilder(pib.get()));
+ int i = 0;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i], keys[i + 1]);
+ i++;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i], keys[i + 1]);
+ i++;
+ builder->Add(keys[i]);
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i], keys[i + 1]);
+ i++;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i]);
+
+ VerifyReader(builder.get(), pib.get());
+ return CountNumOfIndexPartitions(pib.get());
+ }
+
+ void TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) {
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+ NewBuilder(pib.get(), prefix_extractor));
+ int i = 0;
+ builder->Add(keys[i]);
+ i++;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i], keys[i + 1]);
+ i++;
+ builder->Add(keys[i]);
+ builder->Add(keys[i]);
+ i++;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i]);
+
+    VerifyReader(builder.get(), pib.get(), /*empty=*/false, prefix_extractor);
+ }
+
+ void TestBlockPerAllKeys() {
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+ NewBuilder(pib.get()));
+ int i = 0;
+ builder->Add(keys[i]);
+ i++;
+ builder->Add(keys[i]);
+ i++;
+ builder->Add(keys[i]);
+ builder->Add(keys[i]);
+ i++;
+ builder->Add(keys[i]);
+ CutABlock(pib.get(), keys[i]);
+
+ VerifyReader(builder.get(), pib.get());
+ }
+
+ void CutABlock(PartitionedIndexBuilder* builder,
+ const std::string& user_key) {
+ // Assuming a block is cut, add an entry to the index
+ std::string key =
+ std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
+ BlockHandle dont_care_block_handle(1, 1);
+ builder->AddIndexEntry(&key, nullptr, dont_care_block_handle);
+ }
+
+ void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key,
+ const std::string& next_user_key) {
+ // Assuming a block is cut, add an entry to the index
+ std::string key =
+ std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
+ std::string next_key = std::string(
+ *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep());
+ BlockHandle dont_care_block_handle(1, 1);
+ Slice slice = Slice(next_key.data(), next_key.size());
+ builder->AddIndexEntry(&key, &slice, dont_care_block_handle);
+ }
+
+ int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) {
+ IndexBuilder::IndexBlocks dont_care_ib;
+ BlockHandle dont_care_bh(10, 10);
+ Status s;
+ int cnt = 0;
+ do {
+ s = builder->Finish(&dont_care_ib, dont_care_bh);
+ cnt++;
+ } while (s.IsIncomplete());
+ return cnt - 1; // 1 is 2nd level index
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest,
+ testing::Values(test::kLatestFormatVersion));
+
+TEST_P(PartitionedFilterBlockTest, EmptyBuilder) {
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get()));
+ const bool empty = true;
+ VerifyReader(builder.get(), pib.get(), empty);
+}
+
+TEST_P(PartitionedFilterBlockTest, OneBlock) {
+ uint64_t max_index_size = MaxIndexSize();
+ for (uint64_t i = 1; i < max_index_size + 1; i++) {
+ table_options_.metadata_block_size = i;
+ TestBlockPerAllKeys();
+ }
+}
+
+TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) {
+ uint64_t max_index_size = MaxIndexSize();
+ for (uint64_t i = 1; i < max_index_size + 1; i++) {
+ table_options_.metadata_block_size = i;
+ TestBlockPerTwoKeys();
+ }
+}
+
+// This reproduces a bug where, when the same prefix spans multiple consecutive
+// blocks, the filter entry would be added only to the first block.
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) {
+ // some small number to cause partition cuts
+ table_options_.metadata_block_size = 1;
+ std::unique_ptr<const SliceTransform> prefix_extractor(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(1));
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+ NewBuilder(pib.get(), prefix_extractor.get()));
+ const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"};
+ builder->Add(pkeys[0]);
+ CutABlock(pib.get(), pkeys[0], pkeys[1]);
+ builder->Add(pkeys[1]);
+ CutABlock(pib.get(), pkeys[1], pkeys[2]);
+ builder->Add(pkeys[2]);
+ CutABlock(pib.get(), pkeys[2]);
+ std::unique_ptr<PartitionedFilterBlockReader> reader(
+ NewReader(builder.get(), pib.get()));
+ for (auto key : pkeys) {
+ auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ ASSERT_TRUE(reader->PrefixMayMatch(
+ prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid,
+ /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ }
+ // Non-existent keys but with the same prefix
+ const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"};
+ for (auto key : pnonkeys) {
+ auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ ASSERT_TRUE(reader->PrefixMayMatch(
+ prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid,
+ /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ }
+}
+
+// This reproduces a bug in format_version=3 where seeking the prefix leads us
+// to the partition before the one that holds the filter for the prefix.
+TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) {
+ // some small number to cause partition cuts
+ table_options_.metadata_block_size = 1;
+ std::unique_ptr<const SliceTransform> prefix_extractor(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(2));
+ std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+ std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+ NewBuilder(pib.get(), prefix_extractor.get()));
+ // In the bug, searching for prefix "p3" on an index with format version 3,
+ // will give the key "p3" and the partition of the keys that are <= p3, i.e.,
+ // p2-keys, where the filter for prefix "p3" does not exist.
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3",
+ "p5-key3"};
+ builder->Add(pkeys[0]);
+ CutABlock(pib.get(), pkeys[0], pkeys[1]);
+ builder->Add(pkeys[1]);
+ CutABlock(pib.get(), pkeys[1], pkeys[2]);
+ builder->Add(pkeys[2]);
+ CutABlock(pib.get(), pkeys[2], pkeys[3]);
+ builder->Add(pkeys[3]);
+ CutABlock(pib.get(), pkeys[3], pkeys[4]);
+ builder->Add(pkeys[4]);
+ CutABlock(pib.get(), pkeys[4]);
+ std::unique_ptr<PartitionedFilterBlockReader> reader(
+ NewReader(builder.get(), pib.get()));
+ for (auto key : pkeys) {
+ auto prefix = prefix_extractor->Transform(key);
+ auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue);
+ const Slice ikey_slice = Slice(*ikey.rep());
+ ASSERT_TRUE(reader->PrefixMayMatch(
+ prefix, prefix_extractor.get(), kNotValid,
+ /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ }
+}
+
+TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) {
+ uint64_t max_index_size = MaxIndexSize();
+ for (uint64_t i = 1; i < max_index_size + 1; i++) {
+ table_options_.metadata_block_size = i;
+ TestBlockPerKey();
+ }
+}
+
+TEST_P(PartitionedFilterBlockTest, PartitionCount) {
+ int num_keys = sizeof(keys) / sizeof(*keys);
+ table_options_.metadata_block_size =
+ std::max(MaxIndexSize(), MaxFilterSize());
+ int partitions = TestBlockPerKey();
+ ASSERT_EQ(partitions, 1);
+ // A low number ensures cutting a block after each key
+ table_options_.metadata_block_size = 1;
+ partitions = TestBlockPerKey();
+ ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
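NewBuilder in the test fixture above derives a per-partition size target from metadata_block_size, shrunk by block_size_deviation percent and rounded up. A worked check of that arithmetic; the assumption here is the default block_size_deviation of 10 from BlockBasedTableOptions:

    #include <algorithm>
    #include <cstdint>

    // ceil(metadata_block_size * (100 - deviation) / 100), floored at 1,
    // exactly as computed in NewBuilder above.
    uint32_t PartitionSize(uint64_t metadata_block_size, uint64_t deviation) {
      auto p = static_cast<uint32_t>(
          ((metadata_block_size * (100 - deviation)) + 99) / 100);
      return std::max<uint32_t>(p, 1);
    }

    int main() {
      // A 64-byte metadata block target with the default 10% deviation
      // yields ceil(64 * 90 / 100) = 58.
      return PartitionSize(64, 10) == 58 ? 0 : 1;
    }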
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.cc b/src/rocksdb/table/block_based/uncompression_dict_reader.cc
new file mode 100644
index 000000000..78e2b93c1
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/uncompression_dict_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status UncompressionDictReader::Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) {
+ assert(table);
+ assert(table->get_rep());
+ assert(!pin || prefetch);
+ assert(uncompression_dict_reader);
+
+ CachableEntry<UncompressionDict> uncompression_dict;
+ if (prefetch || !use_cache) {
+ const Status s = ReadUncompressionDictionary(
+ table, prefetch_buffer, ReadOptions(), use_cache,
+ nullptr /* get_context */, lookup_context, &uncompression_dict);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (use_cache && !pin) {
+ uncompression_dict.Reset();
+ }
+ }
+
+ uncompression_dict_reader->reset(
+ new UncompressionDictReader(table, std::move(uncompression_dict)));
+
+ return Status::OK();
+}
+
+Status UncompressionDictReader::ReadUncompressionDictionary(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<UncompressionDict>* uncompression_dict) {
+ // TODO: add perf counter for compression dictionary read time
+
+ assert(table);
+ assert(uncompression_dict);
+ assert(uncompression_dict->IsEmpty());
+
+ const BlockBasedTable::Rep* const rep = table->get_rep();
+ assert(rep);
+ assert(!rep->compression_dict_handle.IsNull());
+
+ const Status s = table->RetrieveBlock(
+ prefetch_buffer, read_options, rep->compression_dict_handle,
+ UncompressionDict::GetEmptyDict(), uncompression_dict,
+ BlockType::kCompressionDictionary, get_context, lookup_context,
+ /* for_compaction */ false, use_cache);
+
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ rep->ioptions.info_log,
+ "Encountered error while reading data from compression dictionary "
+ "block %s",
+ s.ToString().c_str());
+ }
+
+ return s;
+}
+
+Status UncompressionDictReader::GetOrReadUncompressionDictionary(
+ FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<UncompressionDict>* uncompression_dict) const {
+ assert(uncompression_dict);
+
+ if (!uncompression_dict_.IsEmpty()) {
+ uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue());
+ return Status::OK();
+ }
+
+ ReadOptions read_options;
+ if (no_io) {
+ read_options.read_tier = kBlockCacheTier;
+ }
+
+ return ReadUncompressionDictionary(table_, prefetch_buffer, read_options,
+ cache_dictionary_blocks(), get_context,
+ lookup_context, uncompression_dict);
+}
+
+size_t UncompressionDictReader::ApproximateMemoryUsage() const {
+ assert(!uncompression_dict_.GetOwnValue() ||
+ uncompression_dict_.GetValue() != nullptr);
+ size_t usage = uncompression_dict_.GetOwnValue()
+ ? uncompression_dict_.GetValue()->ApproximateMemoryUsage()
+ : 0;
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+
+ return usage;
+}
+
+bool UncompressionDictReader::cache_dictionary_blocks() const {
+ assert(table_);
+ assert(table_->get_rep());
+
+ return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+} // namespace ROCKSDB_NAMESPACE
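GetOrReadUncompressionDictionary above follows a pinned-or-fetch shape: hand out an unowned reference to the pinned dictionary if the reader holds one, otherwise retrieve it, restricting the read to the block cache tier when no_io is set. A stripped-down sketch of that shape; all types and names below are illustrative stand-ins, not RocksDB's:

    #include <optional>
    #include <string>

    enum class ReadTier { kReadAll, kBlockCacheOnly };

    struct Dict {
      std::string data;
    };

    // Stand-in for RetrieveBlock: a cache-only read may legitimately fail
    // when the block is not resident.
    std::optional<Dict> RetrieveDict(ReadTier tier) {
      if (tier == ReadTier::kBlockCacheOnly) return std::nullopt;  // miss
      return Dict{"dictionary bytes"};
    }

    class DictReader {
     public:
      std::optional<Dict> GetOrRead(bool no_io) const {
        if (pinned_) {
          return *pinned_;  // unowned in the real code; copied for brevity
        }
        return RetrieveDict(no_io ? ReadTier::kBlockCacheOnly
                                  : ReadTier::kReadAll);
      }

     private:
      std::optional<Dict> pinned_;  // populated when prefetch && pin
    };

    int main() {
      DictReader reader;
      return reader.GetOrRead(/*no_io=*/false) ? 0 : 1;
    }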
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.h b/src/rocksdb/table/block_based/uncompression_dict_reader.h
new file mode 100644
index 000000000..3e7826179
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+#include "table/block_based/cachable_entry.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+struct BlockCacheLookupContext;
+class FilePrefetchBuffer;
+class GetContext;
+struct ReadOptions;
+struct UncompressionDict;
+
+// Provides access to the uncompression dictionary regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class UncompressionDictReader {
+ public:
+ static Status Create(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader);
+
+ Status GetOrReadUncompressionDictionary(
+ FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<UncompressionDict>* uncompression_dict) const;
+
+ size_t ApproximateMemoryUsage() const;
+
+ private:
+ UncompressionDictReader(const BlockBasedTable* t,
+ CachableEntry<UncompressionDict>&& uncompression_dict)
+ : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
+ assert(table_);
+ }
+
+ bool cache_dictionary_blocks() const;
+
+ static Status ReadUncompressionDictionary(
+ const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ CachableEntry<UncompressionDict>* uncompression_dict);
+
+ const BlockBasedTable* table_;
+ CachableEntry<UncompressionDict> uncompression_dict_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
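The reader keeps its dictionary in a CachableEntry, which either owns the value outright or merely references one owned elsewhere (the SetUnownedValue case above). A toy version covering just those two states; the real class additionally tracks a block cache handle, which is omitted here:

    #include <memory>

    template <typename T>
    class MiniCachableEntry {
     public:
      // Own the value outright, e.g. when the block cache is bypassed.
      void SetOwnedValue(std::unique_ptr<T> v) {
        owned_ = std::move(v);
        value_ = owned_.get();
      }
      // Reference a value whose lifetime someone else guarantees, as
      // SetUnownedValue does for the pinned dictionary.
      void SetUnownedValue(T* v) {
        owned_.reset();
        value_ = v;
      }
      bool GetOwnValue() const { return owned_ != nullptr; }
      T* GetValue() const { return value_; }
      bool IsEmpty() const { return value_ == nullptr; }

     private:
      std::unique_ptr<T> owned_;
      T* value_ = nullptr;
    };

    int main() {
      MiniCachableEntry<int> e;
      e.SetOwnedValue(std::make_unique<int>(42));
      return (e.GetOwnValue() && !e.IsEmpty()) ? 0 : 1;
    }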
diff --git a/src/rocksdb/table/block_fetcher.cc b/src/rocksdb/table/block_fetcher.cc
new file mode 100644
index 000000000..9ddee72cc
--- /dev/null
+++ b/src/rocksdb/table/block_fetcher.cc
@@ -0,0 +1,284 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_fetcher.h"
+
+#include <cinttypes>
+#include <string>
+
+#include "logging/logging.h"
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "table/persistent_cache_helper.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline void BlockFetcher::CheckBlockChecksum() {
+ // Check the crc of the type and the block contents
+ if (read_options_.verify_checksums) {
+ const char* data = slice_.data(); // Pointer to where Read put the data
+ PERF_TIMER_GUARD(block_checksum_time);
+ uint32_t value = DecodeFixed32(data + block_size_ + 1);
+ uint32_t actual = 0;
+ switch (footer_.checksum()) {
+ case kNoChecksum:
+ break;
+ case kCRC32c:
+ value = crc32c::Unmask(value);
+ actual = crc32c::Value(data, block_size_ + 1);
+ break;
+ case kxxHash:
+ actual = XXH32(data, static_cast<int>(block_size_) + 1, 0);
+ break;
+ case kxxHash64:
+ actual = static_cast<uint32_t>(
+ XXH64(data, static_cast<int>(block_size_) + 1, 0) &
+ uint64_t{0xffffffff});
+ break;
+ default:
+ status_ = Status::Corruption(
+ "unknown checksum type " + ToString(footer_.checksum()) + " in " +
+ file_->file_name() + " offset " + ToString(handle_.offset()) +
+ " size " + ToString(block_size_));
+ }
+ if (status_.ok() && actual != value) {
+ status_ = Status::Corruption(
+ "block checksum mismatch: expected " + ToString(actual) + ", got " +
+ ToString(value) + " in " + file_->file_name() + " offset " +
+ ToString(handle_.offset()) + " size " + ToString(block_size_));
+ }
+ }
+}
+
+inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
+ if (cache_options_.persistent_cache &&
+ !cache_options_.persistent_cache->IsCompressed()) {
+ Status status = PersistentCacheHelper::LookupUncompressedPage(
+ cache_options_, handle_, contents_);
+ if (status.ok()) {
+ // uncompressed page is found for the block handle
+ return true;
+ } else {
+ // uncompressed page is not found
+ if (ioptions_.info_log && !status.IsNotFound()) {
+ assert(!status.ok());
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "Error reading from persistent cache. %s",
+ status.ToString().c_str());
+ }
+ }
+ }
+ return false;
+}
+
+inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
+ if (prefetch_buffer_ != nullptr &&
+ prefetch_buffer_->TryReadFromCache(
+ handle_.offset(),
+ static_cast<size_t>(handle_.size()) + kBlockTrailerSize, &slice_,
+ for_compaction_)) {
+ block_size_ = static_cast<size_t>(handle_.size());
+ CheckBlockChecksum();
+ if (!status_.ok()) {
+ return true;
+ }
+ got_from_prefetch_buffer_ = true;
+ used_buf_ = const_cast<char*>(slice_.data());
+ }
+ return got_from_prefetch_buffer_;
+}
+
+inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() {
+ if (cache_options_.persistent_cache &&
+ cache_options_.persistent_cache->IsCompressed()) {
+    // lookup the compressed page in compressed cache mode p-cache
+ std::unique_ptr<char[]> raw_data;
+ status_ = PersistentCacheHelper::LookupRawPage(
+ cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize);
+ if (status_.ok()) {
+ heap_buf_ = CacheAllocationPtr(raw_data.release());
+ used_buf_ = heap_buf_.get();
+ slice_ = Slice(heap_buf_.get(), block_size_);
+ return true;
+ } else if (!status_.IsNotFound() && ioptions_.info_log) {
+ assert(!status_.ok());
+ ROCKS_LOG_INFO(ioptions_.info_log,
+ "Error reading from persistent cache. %s",
+ status_.ToString().c_str());
+ }
+ }
+ return false;
+}
+
+  // cache miss: read from device
+ // cache miss read from device
+ if (do_uncompress_ &&
+ block_size_ + kBlockTrailerSize < kDefaultStackBufferSize) {
+ // If we've got a small enough hunk of data, read it in to the
+ // trivially allocated stack buffer instead of needing a full malloc()
+ used_buf_ = &stack_buf_[0];
+ } else if (maybe_compressed_ && !do_uncompress_) {
+ compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize,
+ memory_allocator_compressed_);
+ used_buf_ = compressed_buf_.get();
+ } else {
+ heap_buf_ =
+ AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_);
+ used_buf_ = heap_buf_.get();
+ }
+}
+
+inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() {
+ if (status_.ok() && read_options_.fill_cache &&
+ cache_options_.persistent_cache &&
+ cache_options_.persistent_cache->IsCompressed()) {
+ // insert to raw cache
+ PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_,
+ block_size_ + kBlockTrailerSize);
+ }
+}
+
+inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() {
+ if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache &&
+ cache_options_.persistent_cache &&
+ !cache_options_.persistent_cache->IsCompressed()) {
+ // insert to uncompressed cache
+ PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_,
+ *contents_);
+ }
+}
+
+inline void BlockFetcher::CopyBufferToHeap() {
+ assert(used_buf_ != heap_buf_.get());
+ heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_);
+ memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize);
+}
+
+inline void BlockFetcher::GetBlockContents() {
+ if (slice_.data() != used_buf_) {
+ // the slice content is not the buffer provided
+ *contents_ = BlockContents(Slice(slice_.data(), block_size_));
+ } else {
+    // The page can be either uncompressed or compressed; the buffer is either
+    // stack- or heap-provided. Refer to
+    // https://github.com/facebook/rocksdb/pull/4096
+ if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
+ CopyBufferToHeap();
+ } else if (used_buf_ == compressed_buf_.get()) {
+ if (compression_type_ == kNoCompression &&
+ memory_allocator_ != memory_allocator_compressed_) {
+ CopyBufferToHeap();
+ } else {
+ heap_buf_ = std::move(compressed_buf_);
+ }
+ }
+ *contents_ = BlockContents(std::move(heap_buf_), block_size_);
+ }
+#ifndef NDEBUG
+ contents_->is_raw_block = true;
+#endif
+}
+
+Status BlockFetcher::ReadBlockContents() {
+ block_size_ = static_cast<size_t>(handle_.size());
+
+ if (TryGetUncompressBlockFromPersistentCache()) {
+ compression_type_ = kNoCompression;
+#ifndef NDEBUG
+ contents_->is_raw_block = true;
+#endif // NDEBUG
+ return Status::OK();
+ }
+ if (TryGetFromPrefetchBuffer()) {
+ if (!status_.ok()) {
+ return status_;
+ }
+ } else if (!TryGetCompressedBlockFromPersistentCache()) {
+ PrepareBufferForBlockFromFile();
+ Status s;
+
+ {
+ PERF_TIMER_GUARD(block_read_time);
+ // Actual file read
+ status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize,
+ &slice_, used_buf_, for_compaction_);
+ }
+ PERF_COUNTER_ADD(block_read_count, 1);
+
+ // TODO: introduce dedicated perf counter for range tombstones
+ switch (block_type_) {
+ case BlockType::kFilter:
+ PERF_COUNTER_ADD(filter_block_read_count, 1);
+ break;
+
+ case BlockType::kCompressionDictionary:
+ PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
+ break;
+
+ case BlockType::kIndex:
+ PERF_COUNTER_ADD(index_block_read_count, 1);
+ break;
+
+ // Nothing to do here as we don't have counters for the other types.
+ default:
+ break;
+ }
+
+ PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize);
+ if (!status_.ok()) {
+ return status_;
+ }
+
+ if (slice_.size() != block_size_ + kBlockTrailerSize) {
+ return Status::Corruption("truncated block read from " +
+ file_->file_name() + " offset " +
+ ToString(handle_.offset()) + ", expected " +
+ ToString(block_size_ + kBlockTrailerSize) +
+ " bytes, got " + ToString(slice_.size()));
+ }
+
+ CheckBlockChecksum();
+ if (status_.ok()) {
+ InsertCompressedBlockToPersistentCacheIfNeeded();
+ } else {
+ return status_;
+ }
+ }
+
+ PERF_TIMER_GUARD(block_decompress_time);
+
+ compression_type_ = get_block_compression_type(slice_.data(), block_size_);
+
+ if (do_uncompress_ && compression_type_ != kNoCompression) {
+ // compressed page, uncompress, update cache
+ UncompressionContext context(compression_type_);
+ UncompressionInfo info(context, uncompression_dict_, compression_type_);
+ status_ = UncompressBlockContents(info, slice_.data(), block_size_,
+ contents_, footer_.version(), ioptions_,
+ memory_allocator_);
+ compression_type_ = kNoCompression;
+ } else {
+ GetBlockContents();
+ }
+
+ InsertUncompressedBlockToPersistentCacheIfNeeded();
+
+ return status_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
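For orientation, CheckBlockChecksum above relies on the on-disk layout: each block is followed by a 5-byte trailer, one compression-type byte plus a 4-byte checksum, and the checksum covers the block contents plus the type byte. A minimal sketch of the verification, with a toy hash in place of RocksDB's masked crc32c/xxHash variants:

    #include <cstdint>
    #include <cstring>

    constexpr size_t kBlockTrailerSize = 5;  // 1 type byte + 4 checksum bytes

    // Toy FNV-1a hash, purely illustrative; the real code dispatches on the
    // footer's checksum type and unmasks stored CRC32c values.
    uint32_t ToyChecksum(const char* data, size_t n) {
      uint32_t h = 2166136261u;
      for (size_t i = 0; i < n; ++i) {
        h = (h ^ static_cast<unsigned char>(data[i])) * 16777619u;
      }
      return h;
    }

    uint32_t DecodeFixed32LE(const char* p) {
      uint32_t v;
      std::memcpy(&v, p, sizeof(v));  // matches how the sum is stored below
      return v;
    }

    // data points at block contents followed by the 5-byte trailer.
    bool VerifyBlock(const char* data, size_t block_size) {
      const uint32_t stored = DecodeFixed32LE(data + block_size + 1);
      const uint32_t actual = ToyChecksum(data, block_size + 1);  // + type byte
      return stored == actual;
    }

    int main() {
      char block[4 + kBlockTrailerSize] = {'d', 'a', 't', 'a', /*type=*/0};
      const uint32_t sum = ToyChecksum(block, 5);
      std::memcpy(block + 5, &sum, sizeof(sum));
      return VerifyBlock(block, 4) ? 0 : 1;
    }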
diff --git a/src/rocksdb/table/block_fetcher.h b/src/rocksdb/table/block_fetcher.h
new file mode 100644
index 000000000..1b003df15
--- /dev/null
+++ b/src/rocksdb/table/block_fetcher.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "memory/memory_allocator.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_type.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or
+// persistent cache provided (if any) to try to avoid reading from the file
+// directly. Note that both the prefetch buffer and the persistent cache are
+// optional; also, note that the persistent cache may be configured to store either
+// compressed or uncompressed blocks.
+//
+// If the retrieved block is compressed and the do_uncompress flag is set,
+// BlockFetcher uncompresses the block (using the uncompression dictionary,
+// if provided, to prime the compression algorithm), and returns the resulting
+// uncompressed block data. Otherwise, it returns the original block.
+//
+// Two read options affect the behavior of BlockFetcher: if verify_checksums is
+// true, the checksum of the (original) block is checked; if fill_cache is true,
+// the block is added to the persistent cache if needed.
+//
+// Memory for uncompressed and compressed blocks is allocated as needed
+// using memory_allocator and memory_allocator_compressed, respectively
+// (if provided; otherwise, the default allocator is used).
+
+class BlockFetcher {
+ public:
+ BlockFetcher(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer, const Footer& footer,
+ const ReadOptions& read_options, const BlockHandle& handle,
+ BlockContents* contents, const ImmutableCFOptions& ioptions,
+ bool do_uncompress, bool maybe_compressed, BlockType block_type,
+ const UncompressionDict& uncompression_dict,
+ const PersistentCacheOptions& cache_options,
+ MemoryAllocator* memory_allocator = nullptr,
+ MemoryAllocator* memory_allocator_compressed = nullptr,
+ bool for_compaction = false)
+ : file_(file),
+ prefetch_buffer_(prefetch_buffer),
+ footer_(footer),
+ read_options_(read_options),
+ handle_(handle),
+ contents_(contents),
+ ioptions_(ioptions),
+ do_uncompress_(do_uncompress),
+ maybe_compressed_(maybe_compressed),
+ block_type_(block_type),
+ uncompression_dict_(uncompression_dict),
+ cache_options_(cache_options),
+ memory_allocator_(memory_allocator),
+ memory_allocator_compressed_(memory_allocator_compressed),
+ for_compaction_(for_compaction) {}
+
+ Status ReadBlockContents();
+ CompressionType get_compression_type() const { return compression_type_; }
+
+ private:
+ static const uint32_t kDefaultStackBufferSize = 5000;
+
+ RandomAccessFileReader* file_;
+ FilePrefetchBuffer* prefetch_buffer_;
+ const Footer& footer_;
+ const ReadOptions read_options_;
+ const BlockHandle& handle_;
+ BlockContents* contents_;
+ const ImmutableCFOptions& ioptions_;
+ bool do_uncompress_;
+ bool maybe_compressed_;
+ BlockType block_type_;
+ const UncompressionDict& uncompression_dict_;
+ const PersistentCacheOptions& cache_options_;
+ MemoryAllocator* memory_allocator_;
+ MemoryAllocator* memory_allocator_compressed_;
+ Status status_;
+ Slice slice_;
+ char* used_buf_ = nullptr;
+ size_t block_size_;
+ CacheAllocationPtr heap_buf_;
+ CacheAllocationPtr compressed_buf_;
+ char stack_buf_[kDefaultStackBufferSize];
+ bool got_from_prefetch_buffer_ = false;
+ ROCKSDB_NAMESPACE::CompressionType compression_type_;
+ bool for_compaction_ = false;
+
+ // return true if found
+ bool TryGetUncompressBlockFromPersistentCache();
+ // return true if found
+ bool TryGetFromPrefetchBuffer();
+ bool TryGetCompressedBlockFromPersistentCache();
+ void PrepareBufferForBlockFromFile();
+ // Copy content from used_buf_ to new heap buffer.
+ void CopyBufferToHeap();
+ void GetBlockContents();
+ void InsertCompressedBlockToPersistentCacheIfNeeded();
+ void InsertUncompressedBlockToPersistentCacheIfNeeded();
+ void CheckBlockChecksum();
+};
+} // namespace ROCKSDB_NAMESPACE
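PrepareBufferForBlockFromFile in the .cc above picks one of three destinations for the raw read. The decision reduces to a small pure function; the enum and names below are mine, not RocksDB's:

    #include <cstddef>

    enum class ReadBuffer { kStack, kCompressedHeap, kHeap };

    constexpr size_t kStackBufferSize = 5000;  // kDefaultStackBufferSize
    constexpr size_t kTrailerSize = 5;

    // Mirrors the branch order above: small blocks that will be uncompressed
    // anyway go to the stack buffer; blocks to be kept compressed go to the
    // compressed-block allocator; everything else to the regular heap.
    ReadBuffer ChooseBuffer(size_t block_size, bool do_uncompress,
                            bool maybe_compressed) {
      if (do_uncompress && block_size + kTrailerSize < kStackBufferSize) {
        return ReadBuffer::kStack;
      }
      if (maybe_compressed && !do_uncompress) {
        return ReadBuffer::kCompressedHeap;
      }
      return ReadBuffer::kHeap;
    }

    int main() {
      return ChooseBuffer(100, true, true) == ReadBuffer::kStack ? 0 : 1;
    }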
diff --git a/src/rocksdb/table/cleanable_test.cc b/src/rocksdb/table/cleanable_test.cc
new file mode 100644
index 000000000..f7d80a39d
--- /dev/null
+++ b/src/rocksdb/table/cleanable_test.cc
@@ -0,0 +1,277 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CleanableTest : public testing::Test {};
+
+// Use this to keep track of the cleanups that were actually performed
+void Multiplier(void* arg1, void* arg2) {
+ int* res = reinterpret_cast<int*>(arg1);
+ int* num = reinterpret_cast<int*>(arg2);
+ *res *= *num;
+}
+
+// the first Cleanup is on stack and the rest on heap, so test with both cases
+TEST_F(CleanableTest, Register) {
+ int n2 = 2, n3 = 3;
+ int res = 1;
+ { Cleanable c1; }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+
+ res = 1;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ }
+ // ~Cleanable
+ ASSERT_EQ(2, res);
+
+ res = 1;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ }
+ // ~Cleanable
+ ASSERT_EQ(6, res);
+
+  // Test that Reset performs the cleanups
+ res = 1;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ c1.Reset();
+ ASSERT_EQ(6, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(6, res);
+
+  // Test that Cleanable is usable after Reset
+ res = 1;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.Reset();
+ ASSERT_EQ(2, res);
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ }
+ // ~Cleanable
+ ASSERT_EQ(6, res);
+}
+
+// the first Cleanup is on stack and the rest on heap,
+// so test all the combinations of them
+TEST_F(CleanableTest, Delegation) {
+ int n2 = 2, n3 = 3, n5 = 5, n7 = 7;
+ int res = 1;
+ {
+ Cleanable c2;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.DelegateCleanupsTo(&c2);
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(2, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ {
+ Cleanable c1;
+ c1.DelegateCleanupsTo(&c2);
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ c1.DelegateCleanupsTo(&c2);
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(6, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5;
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(30, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5;
+ c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3;
+ c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5 * 7;
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(210, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5;
+ c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7;
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ c1.DelegateCleanupsTo(&c2); // res = 2 * 5 * 7;
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(70, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5;
+ c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7;
+ {
+ Cleanable c1;
+ c1.DelegateCleanupsTo(&c2); // res = 5 * 7;
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(35, res);
+
+ res = 1;
+ {
+ Cleanable c2;
+ c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5;
+ {
+ Cleanable c1;
+ c1.DelegateCleanupsTo(&c2); // res = 5;
+ }
+ // ~Cleanable
+ ASSERT_EQ(1, res);
+ }
+ // ~Cleanable
+ ASSERT_EQ(5, res);
+}
+
+static void ReleaseStringHeap(void* s, void*) {
+ delete reinterpret_cast<const std::string*>(s);
+}
+
+class PinnableSlice4Test : public PinnableSlice {
+ public:
+ void TestStringIsRegistered(std::string* s) {
+ ASSERT_TRUE(cleanup_.function == ReleaseStringHeap);
+ ASSERT_EQ(cleanup_.arg1, s);
+ ASSERT_EQ(cleanup_.arg2, nullptr);
+ ASSERT_EQ(cleanup_.next, nullptr);
+ }
+};
+
+// Putting the PinnableSlice tests here due to similarity to Cleanable tests
+TEST_F(CleanableTest, PinnableSlice) {
+ int n2 = 2;
+ int res = 1;
+ const std::string const_str = "123";
+
+ {
+ res = 1;
+ PinnableSlice4Test value;
+ Slice slice(const_str);
+ value.PinSlice(slice, Multiplier, &res, &n2);
+ std::string str;
+ str.assign(value.data(), value.size());
+ ASSERT_EQ(const_str, str);
+ }
+ // ~Cleanable
+ ASSERT_EQ(2, res);
+
+ {
+ res = 1;
+ PinnableSlice4Test value;
+ Slice slice(const_str);
+ {
+ Cleanable c1;
+ c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2;
+ value.PinSlice(slice, &c1);
+ }
+ // ~Cleanable
+    ASSERT_EQ(1, res);  // cleanups must have been delegated to value
+ std::string str;
+ str.assign(value.data(), value.size());
+ ASSERT_EQ(const_str, str);
+ }
+ // ~Cleanable
+ ASSERT_EQ(2, res);
+
+ {
+ PinnableSlice4Test value;
+ Slice slice(const_str);
+ value.PinSelf(slice);
+ std::string str;
+ str.assign(value.data(), value.size());
+ ASSERT_EQ(const_str, str);
+ }
+
+ {
+ PinnableSlice4Test value;
+ std::string* self_str_ptr = value.GetSelf();
+ self_str_ptr->assign(const_str);
+ value.PinSelf();
+ std::string str;
+ str.assign(value.data(), value.size());
+ ASSERT_EQ(const_str, str);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
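For readers skimming the tests above: the Cleanable contract is that registered cleanups run exactly once, at destruction or Reset, unless first handed off via DelegateCleanupsTo. A reduced reimplementation of that contract, not RocksDB's (which uses an intrusive list with the first node stored inline):

    #include <vector>

    class MiniCleanable {
     public:
      using CleanupFunction = void (*)(void* arg1, void* arg2);

      ~MiniCleanable() { RunCleanups(); }

      void RegisterCleanup(CleanupFunction fn, void* arg1, void* arg2) {
        cleanups_.push_back(Cleanup{fn, arg1, arg2});
      }

      // Hand all pending cleanups to `other`; they now run when `other` is
      // destroyed or Reset, and no longer when *this is.
      void DelegateCleanupsTo(MiniCleanable* other) {
        for (const Cleanup& c : cleanups_) {
          other->cleanups_.push_back(c);
        }
        cleanups_.clear();
      }

      // Run cleanups now and leave the object reusable, as the Register
      // test expects.
      void Reset() { RunCleanups(); }

     private:
      struct Cleanup {
        CleanupFunction fn;
        void* arg1;
        void* arg2;
      };

      void RunCleanups() {
        for (const Cleanup& c : cleanups_) {
          c.fn(c.arg1, c.arg2);
        }
        cleanups_.clear();
      }

      std::vector<Cleanup> cleanups_;
    };

    int main() {
      int res = 1, n2 = 2;
      {
        MiniCleanable c;
        c.RegisterCleanup(
            [](void* r, void* n) {
              *static_cast<int*>(r) *= *static_cast<int*>(n);
            },
            &res, &n2);
      }  // destructor runs the cleanup: res *= 2
      return res == 2 ? 0 : 1;
    }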
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
new file mode 100644
index 000000000..3ddce0b6f
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
@@ -0,0 +1,528 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <assert.h>
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/autovector.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+const std::string CuckooTablePropertyNames::kEmptyKey =
+ "rocksdb.cuckoo.bucket.empty.key";
+const std::string CuckooTablePropertyNames::kNumHashFunc =
+ "rocksdb.cuckoo.hash.num";
+const std::string CuckooTablePropertyNames::kHashTableSize =
+ "rocksdb.cuckoo.hash.size";
+const std::string CuckooTablePropertyNames::kValueLength =
+ "rocksdb.cuckoo.value.length";
+const std::string CuckooTablePropertyNames::kIsLastLevel =
+ "rocksdb.cuckoo.file.islastlevel";
+const std::string CuckooTablePropertyNames::kCuckooBlockSize =
+ "rocksdb.cuckoo.hash.cuckooblocksize";
+const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
+ "rocksdb.cuckoo.hash.identityfirst";
+const std::string CuckooTablePropertyNames::kUseModuleHash =
+ "rocksdb.cuckoo.hash.usemodule";
+const std::string CuckooTablePropertyNames::kUserKeyLength =
+ "rocksdb.cuckoo.hash.userkeylength";
+
+// Obtained by running echo rocksdb.table.cuckoo | sha1sum
+extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
+
+CuckooTableBuilder::CuckooTableBuilder(
+ WritableFileWriter* file, double max_hash_table_ratio,
+ uint32_t max_num_hash_table, uint32_t max_search_depth,
+ const Comparator* user_comparator, uint32_t cuckoo_block_size,
+ bool use_module_hash, bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+ uint32_t column_family_id, const std::string& column_family_name)
+ : num_hash_func_(2),
+ file_(file),
+ max_hash_table_ratio_(max_hash_table_ratio),
+ max_num_hash_func_(max_num_hash_table),
+ max_search_depth_(max_search_depth),
+ cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
+ hash_table_size_(use_module_hash ? 0 : 2),
+ is_last_level_file_(false),
+ has_seen_first_key_(false),
+ has_seen_first_value_(false),
+ key_size_(0),
+ value_size_(0),
+ num_entries_(0),
+ num_values_(0),
+ ucomp_(user_comparator),
+ use_module_hash_(use_module_hash),
+ identity_as_first_hash_(identity_as_first_hash),
+ get_slice_hash_(get_slice_hash),
+ closed_(false) {
+ // Data is in a huge block.
+ properties_.num_data_blocks = 1;
+ properties_.index_size = 0;
+ properties_.filter_size = 0;
+ properties_.column_family_id = column_family_id;
+ properties_.column_family_name = column_family_name;
+}
+
+void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
+ if (num_entries_ >= kMaxVectorIdx - 1) {
+    status_ = Status::NotSupported("Number of keys in a file must be < 2^31-1");
+ return;
+ }
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(key, &ikey)) {
+    status_ = Status::Corruption("Unable to parse key into internal key.");
+ return;
+ }
+ if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
+ status_ = Status::NotSupported("Unsupported key type " +
+ ToString(ikey.type));
+ return;
+ }
+
+  // Determine if we can ignore the sequence number and value type from
+  // internal keys by looking at the sequence number of the first key. We
+  // assume that if the first key has a zero sequence number, then all the
+  // remaining keys will have zero sequence numbers as well.
+ if (!has_seen_first_key_) {
+ is_last_level_file_ = ikey.sequence == 0;
+ has_seen_first_key_ = true;
+ smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
+ }
+ if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
+ status_ = Status::NotSupported("all keys have to be the same size");
+ return;
+ }
+
+ if (ikey.type == kTypeValue) {
+ if (!has_seen_first_value_) {
+ has_seen_first_value_ = true;
+ value_size_ = value.size();
+ }
+ if (value_size_ != value.size()) {
+ status_ = Status::NotSupported("all values have to be the same size");
+ return;
+ }
+
+ if (is_last_level_file_) {
+ kvs_.append(ikey.user_key.data(), ikey.user_key.size());
+ } else {
+ kvs_.append(key.data(), key.size());
+ }
+ kvs_.append(value.data(), value.size());
+ ++num_values_;
+ } else {
+ if (is_last_level_file_) {
+ deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
+ } else {
+ deleted_keys_.append(key.data(), key.size());
+ }
+ }
+ ++num_entries_;
+
+  // In order to fill the empty buckets in the hash table, we identify a
+  // key which has not been used so far (unused_user_key). We determine this
+  // by maintaining the smallest and largest keys inserted so far in bytewise
+  // order and use them to find a key outside this range in the Finish()
+  // operation. Note that this strategy is independent of the user comparator
+  // used here.
+ if (ikey.user_key.compare(smallest_user_key_) < 0) {
+ smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ } else if (ikey.user_key.compare(largest_user_key_) > 0) {
+ largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ }
+ if (!use_module_hash_) {
+ if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
+ hash_table_size_ *= 2;
+ }
+ }
+}
+
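+// Layout note: with key_size_ = 16 and value_size_ = 4 (illustrative
+// numbers), the i-th value-bearing entry occupies bytes
+// [i * 20, (i + 1) * 20) of kvs_: the first 16 bytes hold the key and the
+// remaining 4 the value, exactly as decoded by the accessors below.
+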
+bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
+ assert(closed_);
+ return idx >= num_values_;
+}
+
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+ assert(closed_);
+ if (IsDeletedKey(idx)) {
+ return Slice(&deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)], static_cast<size_t>(key_size_));
+ }
+ return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))], static_cast<size_t>(key_size_));
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+ assert(closed_);
+ return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+ assert(closed_);
+ if (IsDeletedKey(idx)) {
+ static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
+ return Slice(empty_value);
+ }
+ return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)], static_cast<size_t>(value_size_));
+}
+
+Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
+ buckets->resize(static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
+ uint32_t make_space_for_key_call_id = 0;
+ for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
+ uint64_t bucket_id = 0;
+ bool bucket_found = false;
+ autovector<uint64_t> hash_vals;
+ Slice user_key = GetUserKey(vector_idx);
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
+ ++hash_cnt) {
+ uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
+ hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+      // If there is a collision, check the next cuckoo_block_size_ locations
+      // for an empty slot. While checking, if we reach the end of the hash
+      // table, stop searching and proceed to the next hash function.
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++hash_val) {
+ if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
+ bucket_id = hash_val;
+ bucket_found = true;
+ break;
+ } else {
+ if (ucomp_->Compare(user_key,
+ GetUserKey((*buckets)[static_cast<size_t>(hash_val)].vector_idx)) == 0) {
+ return Status::NotSupported("Same key is being inserted again.");
+ }
+ hash_vals.push_back(hash_val);
+ }
+ }
+ }
+ while (!bucket_found && !MakeSpaceForKey(hash_vals,
+ ++make_space_for_key_call_id, buckets, &bucket_id)) {
+      // Rehash by increasing the number of hash functions.
+ if (num_hash_func_ >= max_num_hash_func_) {
+ return Status::NotSupported("Too many collisions. Unable to hash.");
+ }
+ // We don't really need to rehash the entire table because old hashes are
+ // still valid and we only increased the number of hash functions.
+ uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
+ hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+ ++num_hash_func_;
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++hash_val) {
+ if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
+ bucket_found = true;
+ bucket_id = hash_val;
+ break;
+ } else {
+ hash_vals.push_back(hash_val);
+ }
+ }
+ }
+ (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
+ }
+ return Status::OK();
+}
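+
+// Probe-order sketch: for a key k, MakeHashTable() tries buckets
+// CuckooHash(k, 0) .. CuckooHash(k, num_hash_func_ - 1), extending each
+// candidate by cuckoo_block_size_ consecutive slots; only when all of them
+// are occupied does it fall back to displacement via MakeSpaceForKey().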
+
+Status CuckooTableBuilder::Finish() {
+ assert(!closed_);
+ closed_ = true;
+ std::vector<CuckooBucket> buckets;
+ Status s;
+ std::string unused_bucket;
+ if (num_entries_ > 0) {
+    // Calculate the actual hash table size if modulo-based hashing is enabled.
+ if (use_module_hash_) {
+ hash_table_size_ =
+ static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
+ }
+ s = MakeHashTable(&buckets);
+ if (!s.ok()) {
+ return s;
+ }
+ // Determine unused_user_key to fill empty buckets.
+ std::string unused_user_key = smallest_user_key_;
+ int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+ while (curr_pos >= 0) {
+ --unused_user_key[curr_pos];
+ if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
+ break;
+ }
+ --curr_pos;
+ }
+ if (curr_pos < 0) {
+ // Try using the largest key to identify an unused key.
+ unused_user_key = largest_user_key_;
+ curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+ while (curr_pos >= 0) {
+ ++unused_user_key[curr_pos];
+ if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
+ break;
+ }
+ --curr_pos;
+ }
+ }
+ if (curr_pos < 0) {
+ return Status::Corruption("Unable to find unused key");
+ }
+ if (is_last_level_file_) {
+ unused_bucket = unused_user_key;
+ } else {
+ ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
+ AppendInternalKey(&unused_bucket, ikey);
+ }
+ }
+ properties_.num_entries = num_entries_;
+ properties_.num_deletions = num_entries_ - num_values_;
+ properties_.fixed_key_len = key_size_;
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kValueLength].assign(
+ reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+
+ uint64_t bucket_size = key_size_ + value_size_;
+ unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
+ // Write the table.
+ uint32_t num_added = 0;
+ for (auto& bucket : buckets) {
+ if (bucket.vector_idx == kMaxVectorIdx) {
+ s = file_->Append(Slice(unused_bucket));
+ } else {
+ ++num_added;
+ s = file_->Append(GetKey(bucket.vector_idx));
+ if (s.ok()) {
+ if (value_size_ > 0) {
+ s = file_->Append(GetValue(bucket.vector_idx));
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ assert(num_added == NumEntries());
+ properties_.raw_key_size = num_added * properties_.fixed_key_len;
+ properties_.raw_value_size = num_added * value_size_;
+
+ uint64_t offset = buckets.size() * bucket_size;
+ properties_.data_size = offset;
+ unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kNumHashFunc].assign(
+ reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
+
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kHashTableSize].assign(
+ reinterpret_cast<const char*>(&hash_table_size_),
+ sizeof(hash_table_size_));
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kIsLastLevel].assign(
+ reinterpret_cast<const char*>(&is_last_level_file_),
+ sizeof(is_last_level_file_));
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kCuckooBlockSize].assign(
+ reinterpret_cast<const char*>(&cuckoo_block_size_),
+ sizeof(cuckoo_block_size_));
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
+ reinterpret_cast<const char*>(&identity_as_first_hash_),
+ sizeof(identity_as_first_hash_));
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kUseModuleHash].assign(
+ reinterpret_cast<const char*>(&use_module_hash_),
+ sizeof(use_module_hash_));
+ uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
+ properties_.user_collected_properties[
+ CuckooTablePropertyNames::kUserKeyLength].assign(
+ reinterpret_cast<const char*>(&user_key_len),
+ sizeof(user_key_len));
+
+ // Write meta blocks.
+ MetaIndexBuilder meta_index_builder;
+ PropertyBlockBuilder property_block_builder;
+
+ property_block_builder.AddTableProperty(properties_);
+ property_block_builder.Add(properties_.user_collected_properties);
+ Slice property_block = property_block_builder.Finish();
+ BlockHandle property_block_handle;
+ property_block_handle.set_offset(offset);
+ property_block_handle.set_size(property_block.size());
+ s = file_->Append(property_block);
+ offset += property_block.size();
+ if (!s.ok()) {
+ return s;
+ }
+
+ meta_index_builder.Add(kPropertiesBlock, property_block_handle);
+ Slice meta_index_block = meta_index_builder.Finish();
+
+ BlockHandle meta_index_block_handle;
+ meta_index_block_handle.set_offset(offset);
+ meta_index_block_handle.set_size(meta_index_block.size());
+ s = file_->Append(meta_index_block);
+ if (!s.ok()) {
+ return s;
+ }
+
+ Footer footer(kCuckooTableMagicNumber, 1);
+ footer.set_metaindex_handle(meta_index_block_handle);
+ footer.set_index_handle(BlockHandle::NullBlockHandle());
+ std::string footer_encoding;
+ footer.EncodeTo(&footer_encoding);
+ s = file_->Append(footer_encoding);
+
+ if (file_ != nullptr) {
+ file_checksum_ = file_->GetFileChecksum();
+ }
+ return s;
+}
+
+void CuckooTableBuilder::Abandon() {
+ assert(!closed_);
+ closed_ = true;
+}
+
+uint64_t CuckooTableBuilder::NumEntries() const {
+ return num_entries_;
+}
+
+uint64_t CuckooTableBuilder::FileSize() const {
+ if (closed_) {
+ return file_->GetFileSize();
+ } else if (num_entries_ == 0) {
+ return 0;
+ }
+
+ if (use_module_hash_) {
+ return static_cast<uint64_t>((key_size_ + value_size_) *
+ num_entries_ / max_hash_table_ratio_);
+ } else {
+    // Account for the number of buckets being a power of two.
+    // As elements are added, the estimated file size stays constant for a
+    // while and then doubles. Since the compaction algorithm stops adding
+    // elements only after the file limit is exceeded, we account for the
+    // extra element being added here.
+ uint64_t expected_hash_table_size = hash_table_size_;
+ if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
+ expected_hash_table_size *= 2;
+ }
+ return (key_size_ + value_size_) * expected_hash_table_size - 1;
+ }
+}
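+
+// Example: with key_size_ = 16, value_size_ = 4 and a current
+// hash_table_size_ of 8 that needs no further doubling, the pre-Finish()
+// estimate above is (16 + 4) * 8 - 1 = 159 bytes.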
+
+// This method is invoked when there is no place to insert the target key.
+// It searches for a set of elements that can be moved to accommodate target
+// key. The search is a BFS graph traversal with first level (hash_vals)
+// being all the buckets target key could go to.
+// Then, from each node (curr_node), we find all the buckets that curr_node
+// could go to. They form the children of curr_node in the tree.
+// We continue the traversal until we find an empty bucket, in which case, we
+// move all elements along the path from first level to this empty bucket, to
+// make space for target key which is inserted at first level (*bucket_id).
+// If the tree depth exceeds the max depth, we return false indicating failure.
+bool CuckooTableBuilder::MakeSpaceForKey(
+ const autovector<uint64_t>& hash_vals,
+ const uint32_t make_space_for_key_call_id,
+ std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
+ struct CuckooNode {
+ uint64_t bucket_id;
+ uint32_t depth;
+ uint32_t parent_pos;
+ CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
+ : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
+ };
+  // This is the BFS search tree, stored simply as a vector.
+  // Each node stores the index of its parent node in the vector.
+ std::vector<CuckooNode> tree;
+  // We want to identify the buckets that were already visited in the current
+  // method call so that we don't add the same buckets again for exploration
+  // in the tree. We do this by maintaining make_space_for_key_call_id, a
+  // count of invocations of this method which acts as a unique id for the
+  // current invocation. We store this number into the buckets that we
+  // explore in the current method call.
+ // It is unlikely for the increment operation to overflow because the maximum
+ // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+ uint64_t bid = hash_vals[hash_cnt];
+ (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = make_space_for_key_call_id;
+ tree.push_back(CuckooNode(bid, 0, 0));
+ }
+ bool null_found = false;
+ uint32_t curr_pos = 0;
+ while (!null_found && curr_pos < tree.size()) {
+ CuckooNode& curr_node = tree[curr_pos];
+ uint32_t curr_depth = curr_node.depth;
+ if (curr_depth >= max_search_depth_) {
+ break;
+ }
+ CuckooBucket& curr_bucket = (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
+ for (uint32_t hash_cnt = 0;
+ hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
+ uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
+ hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
+ get_slice_hash_);
+ // Iterate inside Cuckoo Block.
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++child_bucket_id) {
+ if ((*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id ==
+ make_space_for_key_call_id) {
+ continue;
+ }
+ (*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id =
+ make_space_for_key_call_id;
+ tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1,
+ curr_pos));
+ if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == kMaxVectorIdx) {
+ null_found = true;
+ break;
+ }
+ }
+ }
+ ++curr_pos;
+ }
+
+ if (null_found) {
+ // There is an empty node in tree.back(). Now, traverse the path from this
+ // empty node to top of the tree and at every node in the path, replace
+ // child with the parent. Stop when first level is reached in the tree
+ // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
+ // this location in first level for target key to be inserted.
+ uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
+ while (bucket_to_replace_pos >= num_hash_func_) {
+ CuckooNode& curr_node = tree[bucket_to_replace_pos];
+ (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
+ (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
+ bucket_to_replace_pos = curr_node.parent_pos;
+ }
+ *bucket_id = tree[bucket_to_replace_pos].bucket_id;
+ }
+ return null_found;
+}
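+
+// Displacement example (illustrative): with two hash functions, suppose the
+// target key hashes to occupied buckets {3, 7} and the entry resident in
+// bucket 7 also hashes to empty bucket 5. The BFS discovers 5 as a child of
+// 7, copies bucket 7's entry into bucket 5, and returns *bucket_id = 7 as
+// the slot for the target key.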
+
+const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksumFuncName();
+ } else {
+ return kUnknownFileChecksumFuncName.c_str();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
new file mode 100644
index 000000000..d41dfed79
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableBuilder: public TableBuilder {
+ public:
+ CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio,
+ uint32_t max_num_hash_func, uint32_t max_search_depth,
+ const Comparator* user_comparator,
+ uint32_t cuckoo_block_size, bool use_module_hash,
+ bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+ uint64_t),
+ uint32_t column_family_id,
+ const std::string& column_family_name);
+ // No copying allowed
+ CuckooTableBuilder(const CuckooTableBuilder&) = delete;
+ void operator=(const CuckooTableBuilder&) = delete;
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~CuckooTableBuilder() {}
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override;
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override { return status_; }
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish() override;
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon() override;
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const override;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const override;
+
+ TableProperties GetTableProperties() const override { return properties_; }
+
+ // Get file checksum
+ const std::string& GetFileChecksum() const override { return file_checksum_; }
+
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override;
+
+ private:
+ struct CuckooBucket {
+ CuckooBucket()
+ : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+ uint32_t vector_idx;
+    // This number will not exceed num_entries_ + max_num_hash_func_.
+    // We assume the number of items is <= 2^31.
+ uint32_t make_space_for_key_call_id;
+ };
+ static const uint32_t kMaxVectorIdx = port::kMaxInt32;
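+  // kMaxVectorIdx (2^31 - 1) doubles as the sentinel marking an empty bucket
+  // in CuckooBucket::vector_idx.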
+
+ bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
+ const uint32_t call_id,
+ std::vector<CuckooBucket>* buckets, uint64_t* bucket_id);
+ Status MakeHashTable(std::vector<CuckooBucket>* buckets);
+
+ inline bool IsDeletedKey(uint64_t idx) const;
+ inline Slice GetKey(uint64_t idx) const;
+ inline Slice GetUserKey(uint64_t idx) const;
+ inline Slice GetValue(uint64_t idx) const;
+
+ uint32_t num_hash_func_;
+ WritableFileWriter* file_;
+ const double max_hash_table_ratio_;
+ const uint32_t max_num_hash_func_;
+ const uint32_t max_search_depth_;
+ const uint32_t cuckoo_block_size_;
+ uint64_t hash_table_size_;
+ bool is_last_level_file_;
+ bool has_seen_first_key_;
+ bool has_seen_first_value_;
+ uint64_t key_size_;
+ uint64_t value_size_;
+  // A list of fixed-size key-value pairs concatenated into a string.
+  // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+  // key / value given an index.
+ std::string kvs_;
+ std::string deleted_keys_;
+ // Number of key-value pairs stored in kvs_ + number of deleted keys
+ uint64_t num_entries_;
+ // Number of keys that contain value (non-deletion op)
+ uint64_t num_values_;
+ Status status_;
+ TableProperties properties_;
+ const Comparator* ucomp_;
+ bool use_module_hash_;
+ bool identity_as_first_hash_;
+ uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+ uint64_t max_num_buckets);
+ std::string largest_user_key_ = "";
+ std::string smallest_user_key_ = "";
+
+ bool closed_; // Either Finish() or Abandon() has been called.
+
+  // Stores the file checksum. If checksumming is disabled, its value is "0".
+ std::string file_checksum_ = kUnknownFileChecksum;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
new file mode 100644
index 000000000..005ce717d
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
@@ -0,0 +1,662 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <vector>
+#include <string>
+#include <map>
+#include <utility>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+extern const uint64_t kCuckooTableMagicNumber;
+
+namespace {
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+ uint64_t /*max_num_buckets*/) {
+ return hash_map[s.ToString()][index];
+}
+} // namespace
+
+class CuckooBuilderTest : public testing::Test {
+ public:
+ CuckooBuilderTest() {
+ env_ = Env::Default();
+ Options options;
+ options.allow_mmap_reads = true;
+ env_options_ = EnvOptions(options);
+ }
+
+ void CheckFileContents(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values,
+ const std::vector<uint64_t>& expected_locations,
+ std::string expected_unused_bucket, uint64_t expected_table_size,
+ uint32_t expected_num_hash_func, bool expected_is_last_level,
+ uint32_t expected_cuckoo_block_size = 1) {
+ uint64_t num_deletions = 0;
+ for (const auto& key : keys) {
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) {
+ num_deletions++;
+ }
+ }
+ // Read file
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_));
+ uint64_t read_file_size;
+ ASSERT_OK(env_->GetFileSize(fname, &read_file_size));
+
+ Options options;
+ options.allow_mmap_reads = true;
+ ImmutableCFOptions ioptions(options);
+
+ // Assert Table Properties.
+ TableProperties* props = nullptr;
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+ ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size,
+ kCuckooTableMagicNumber, ioptions,
+ &props, true /* compression_type_missing */));
+ // Check unused bucket.
+ std::string unused_key = props->user_collected_properties[
+ CuckooTablePropertyNames::kEmptyKey];
+ ASSERT_EQ(expected_unused_bucket.substr(0,
+ props->fixed_key_len), unused_key);
+
+ uint64_t value_len_found =
+ *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
+ CuckooTablePropertyNames::kValueLength].data());
+ ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
+ ASSERT_EQ(props->raw_value_size, values.size()*value_len_found);
+ const uint64_t table_size =
+ *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
+ CuckooTablePropertyNames::kHashTableSize].data());
+ ASSERT_EQ(expected_table_size, table_size);
+ const uint32_t num_hash_func_found =
+ *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+ CuckooTablePropertyNames::kNumHashFunc].data());
+ ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
+ const uint32_t cuckoo_block_size =
+ *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+ CuckooTablePropertyNames::kCuckooBlockSize].data());
+ ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
+ const bool is_last_level_found =
+ *reinterpret_cast<const bool*>(props->user_collected_properties[
+ CuckooTablePropertyNames::kIsLastLevel].data());
+ ASSERT_EQ(expected_is_last_level, is_last_level_found);
+
+ ASSERT_EQ(props->num_entries, keys.size());
+ ASSERT_EQ(props->num_deletions, num_deletions);
+ ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
+ ASSERT_EQ(props->data_size, expected_unused_bucket.size() *
+ (expected_table_size + expected_cuckoo_block_size - 1));
+ ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len);
+ ASSERT_EQ(props->column_family_id, 0);
+ ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName);
+ delete props;
+
+ // Check contents of the bucket.
+ std::vector<bool> keys_found(keys.size(), false);
+ size_t bucket_size = expected_unused_bucket.size();
+ for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
+ Slice read_slice;
+ ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice,
+ nullptr));
+ size_t key_idx =
+ std::find(expected_locations.begin(), expected_locations.end(), i) -
+ expected_locations.begin();
+ if (key_idx == keys.size()) {
+ // i is not one of the expected locations. Empty bucket.
+ if (read_slice.data() == nullptr) {
+ ASSERT_EQ(0, expected_unused_bucket.size());
+ } else {
+ ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0);
+ }
+ } else {
+ keys_found[key_idx] = true;
+ ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0);
+ }
+ }
+ for (auto key_found : keys_found) {
+      // Check that all keys were found.
+ ASSERT_TRUE(key_found);
+ }
+ }
+
+ std::string GetInternalKey(Slice user_key, bool zero_seqno,
+ ValueType type = kTypeValue) {
+ IterKey ikey;
+ ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, type);
+ return ikey.GetInternalKey().ToString();
+ }
+
+ uint64_t NextPowOf2(uint64_t num) {
+ uint64_t n = 2;
+ while (n <= num) {
+ n *= 2;
+ }
+ return n;
+ }
+
+ uint64_t GetExpectedTableSize(uint64_t num) {
+ return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio));
+ }
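+
+  // Example: 4 / 0.9 truncates to 4 and NextPowOf2(4) == 8, so with
+  // kHashTableRatio == 0.9, GetExpectedTableSize(4) == 8.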
+
+ Env* env_;
+ EnvOptions env_options_;
+ std::string fname;
+ const double kHashTableRatio = 0.9;
+};
+
+TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) {
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("EmptyFile");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100,
+ BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ ASSERT_EQ(0UL, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ CheckFileContents({}, {}, {}, "", 2, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
+ for (auto type : {kTypeValue, kTypeDeletion}) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values;
+ if (type == kTypeValue) {
+ values = {"v01", "v02", "v03", "v04"};
+ } else {
+ values = {"", "", "", ""};
+ }
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {1, 2, 3, 4}},
+ {user_keys[2], {2, 3, 4, 5}},
+ {user_keys[3], {3, 4, 5, 6}}};
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false, type));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("NoCollisionFullKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 2, false);
+ }
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionFullKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 4, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ uint32_t cuckoo_block_size = 2;
+ fname = test::PerThreadDBPath("WithCollisionFullKey2");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(
+ file_writer.get(), kHashTableRatio, num_hash_fun, 100,
+ BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash,
+ 0 /* column_family_id */, kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
+ // Have two hash functions. Insert elements with overlapping hashes.
+ // Finally insert an element with hash value somewhere in the middle
+ // so that it displaces all the elements after that.
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03",
+ "key04", "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}},
+ {user_keys[1], {1, 2}},
+ {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}},
+ {user_keys[4], {0, 2}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionPathFullKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03",
+ "key04", "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}},
+ {user_keys[1], {1, 2}},
+ {user_keys[2], {3, 4}},
+ {user_keys[3], {4, 5}},
+ {user_keys[4], {0, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 2, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, false, 2);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {1, 2, 3, 4}},
+ {user_keys[2], {2, 3, 4, 5}},
+ {user_keys[3], {3, 4, 5, 6}}};
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("NoCollisionUserKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionUserKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 4, true);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03",
+ "key04", "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}},
+ {user_keys[1], {1, 2}},
+ {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}},
+ {user_keys[4], {0, 2}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 2, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
+ // Have two hash functions. Insert elements with overlapping hashes.
+ // Finally try inserting an element with hash value somewhere in the middle
+ // and it should fail because the no. of elements to displace is too high.
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03",
+ "key04", "key05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}},
+ {user_keys[1], {1, 2}},
+ {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}},
+ {user_keys[4], {0, 1}},
+ };
+ hash_map = std::move(hm);
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 2, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ ASSERT_TRUE(builder.Finish().IsNotSupported());
+ ASSERT_OK(file_writer->Close());
+}
+
+TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {"repeatedkey", {0, 1, 2, 3}}};
+ hash_map = std::move(hm);
+ uint32_t num_hash_fun = 4;
+ std::string user_key = "repeatedkey";
+
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("FailWhenSameKeyInserted");
+ ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ EnvOptions()));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+
+ builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));
+ ASSERT_EQ(builder.NumEntries(), 1u);
+ ASSERT_OK(builder.status());
+ builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2"));
+ ASSERT_EQ(builder.NumEntries(), 2u);
+ ASSERT_OK(builder.status());
+
+ ASSERT_TRUE(builder.Finish().IsNotSupported());
+ ASSERT_OK(file_writer->Close());
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
new file mode 100644
index 000000000..5ba48f099
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_factory.h"
+
+#include "db/dbformat.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status CuckooTableFactory::NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(
+ table_reader_options.ioptions, std::move(file), file_size,
+ table_reader_options.internal_comparator.user_comparator(), nullptr));
+ Status s = new_reader->status();
+ if (s.ok()) {
+ *table = std::move(new_reader);
+ }
+ return s;
+}
+
+TableBuilder* CuckooTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
+ WritableFileWriter* file) const {
+  // Ignore the skip_filters flag. It does not apply to this file format.
+
+ // TODO: change builder to take the option struct
+ return new CuckooTableBuilder(
+ file, table_options_.hash_table_ratio, 64,
+ table_options_.max_search_depth,
+ table_builder_options.internal_comparator.user_comparator(),
+ table_options_.cuckoo_block_size, table_options_.use_module_hash,
+ table_options_.identity_as_first_hash, nullptr /* get_slice_hash */,
+ column_family_id, table_builder_options.column_family_name);
+}
+
+std::string CuckooTableFactory::GetPrintableTableOptions() const {
+ std::string ret;
+ ret.reserve(2000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n",
+ table_options_.hash_table_ratio);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " max_search_depth: %u\n",
+ table_options_.max_search_depth);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n",
+ table_options_.cuckoo_block_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n",
+ table_options_.identity_as_first_hash);
+ ret.append(buffer);
+ return ret;
+}
+
+TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) {
+ return new CuckooTableFactory(table_options);
+}
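+
+// Usage sketch (illustrative):
+//   CuckooTableOptions opts;
+//   opts.hash_table_ratio = 0.9;  // fill the hash table up to 90%
+//   Options options;
+//   options.table_factory.reset(NewCuckooTableFactory(opts));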
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
new file mode 100644
index 000000000..1b5b6c200
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include "rocksdb/table.h"
+#include "util/murmurhash.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t kCuckooMurmurSeedMultiplier = 816922183;
+static inline uint64_t CuckooHash(
+ const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
+ uint64_t table_size_, bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
+#if !defined NDEBUG || defined OS_WIN
+ // This part is used only in unit tests but we have to keep it for Windows
+// build, as we run tests in both debug and release modes under Windows.
+ if (get_slice_hash != nullptr) {
+ return get_slice_hash(user_key, hash_cnt, table_size_);
+ }
+#else
+ (void)get_slice_hash;
+#endif
+
+ uint64_t value = 0;
+ if (hash_cnt == 0 && identity_as_first_hash) {
+ value = (*reinterpret_cast<const int64_t*>(user_key.data()));
+ } else {
+ value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()),
+ kCuckooMurmurSeedMultiplier * hash_cnt);
+ }
+ if (use_module_hash) {
+ return value % table_size_;
+ } else {
+ return value & (table_size_ - 1);
+ }
+}
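+
+// Note: when use_module_hash is false the table size must be a power of two
+// (the builder maintains this invariant), since the bucket is selected with
+// value & (table_size_ - 1); when it is true any table size works, at the
+// cost of a modulo operation per probe.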
+
+// Cuckoo Table is designed for applications that require fast point lookups
+// but not fast range scans.
+//
+// Some assumptions:
+// - Key length and Value length are fixed.
+// - Does not support Snapshot.
+// - Does not support Merge operations.
+// - Does not support prefix bloom filters.
+class CuckooTableFactory : public TableFactory {
+ public:
+ explicit CuckooTableFactory(const CuckooTableOptions& table_options)
+ : table_options_(table_options) {}
+ ~CuckooTableFactory() {}
+
+ const char* Name() const override { return "CuckooTable"; }
+
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool prefetch_index_and_filter_in_cache = true) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ uint32_t column_family_id, WritableFileWriter* file) const override;
+
+ // Sanitizes the specified DB Options.
+ Status SanitizeOptions(
+ const DBOptions& /*db_opts*/,
+ const ColumnFamilyOptions& /*cf_opts*/) const override {
+ return Status::OK();
+ }
+
+ std::string GetPrintableTableOptions() const override;
+
+ void* GetOptions() override { return &table_options_; }
+
+ Status GetOptionString(std::string* /*opt_string*/,
+ const std::string& /*delimiter*/) const override {
+ return Status::OK();
+ }
+
+ private:
+ CuckooTableOptions table_options_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
new file mode 100644
index 000000000..cd460bd0a
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
@@ -0,0 +1,399 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+#include "memory/arena.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
+const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+}  // namespace
+
+extern const uint64_t kCuckooTableMagicNumber;
+
+CuckooTableReader::CuckooTableReader(
+ const ImmutableCFOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ const Comparator* comparator,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
+ : file_(std::move(file)),
+ is_last_level_(false),
+ identity_as_first_hash_(false),
+ use_module_hash_(false),
+ num_hash_func_(0),
+ unused_key_(""),
+ key_length_(0),
+ user_key_length_(0),
+ value_length_(0),
+ bucket_length_(0),
+ cuckoo_block_size_(0),
+ cuckoo_block_bytes_minus_one_(0),
+ table_size_(0),
+ ucomp_(comparator),
+ get_slice_hash_(get_slice_hash) {
+ if (!ioptions.allow_mmap_reads) {
+ status_ = Status::InvalidArgument("File is not mmaped");
+ }
+ TableProperties* props = nullptr;
+ status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
+ ioptions, &props, true /* compression_type_missing */);
+ if (!status_.ok()) {
+ return;
+ }
+ table_props_.reset(props);
+ auto& user_props = props->user_collected_properties;
+ auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
+ if (hash_funs == user_props.end()) {
+ status_ = Status::Corruption("Number of hash functions not found");
+ return;
+ }
+ num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
+ auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
+ if (unused_key == user_props.end()) {
+ status_ = Status::Corruption("Empty bucket value not found");
+ return;
+ }
+ unused_key_ = unused_key->second;
+
+ key_length_ = static_cast<uint32_t>(props->fixed_key_len);
+ auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
+ if (user_key_len == user_props.end()) {
+ status_ = Status::Corruption("User key length not found");
+ return;
+ }
+ user_key_length_ = *reinterpret_cast<const uint32_t*>(
+ user_key_len->second.data());
+
+ auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
+ if (value_length == user_props.end()) {
+ status_ = Status::Corruption("Value length not found");
+ return;
+ }
+ value_length_ = *reinterpret_cast<const uint32_t*>(
+ value_length->second.data());
+ bucket_length_ = key_length_ + value_length_;
+
+ auto hash_table_size = user_props.find(
+ CuckooTablePropertyNames::kHashTableSize);
+ if (hash_table_size == user_props.end()) {
+ status_ = Status::Corruption("Hash table size not found");
+ return;
+ }
+ table_size_ = *reinterpret_cast<const uint64_t*>(
+ hash_table_size->second.data());
+
+ auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
+ if (is_last_level == user_props.end()) {
+ status_ = Status::Corruption("Is last level not found");
+ return;
+ }
+ is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
+
+ auto identity_as_first_hash = user_props.find(
+ CuckooTablePropertyNames::kIdentityAsFirstHash);
+ if (identity_as_first_hash == user_props.end()) {
+ status_ = Status::Corruption("identity as first hash not found");
+ return;
+ }
+ identity_as_first_hash_ = *reinterpret_cast<const bool*>(
+ identity_as_first_hash->second.data());
+
+ auto use_module_hash = user_props.find(
+ CuckooTablePropertyNames::kUseModuleHash);
+ if (use_module_hash == user_props.end()) {
+ status_ = Status::Corruption("hash type is not found");
+ return;
+ }
+ use_module_hash_ = *reinterpret_cast<const bool*>(
+ use_module_hash->second.data());
+ auto cuckoo_block_size = user_props.find(
+ CuckooTablePropertyNames::kCuckooBlockSize);
+ if (cuckoo_block_size == user_props.end()) {
+ status_ = Status::Corruption("Cuckoo block size not found");
+ return;
+ }
+ cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
+ cuckoo_block_size->second.data());
+ cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
+ status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr);
+}
+
+Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
+ const Slice& key, GetContext* get_context,
+ const SliceTransform* /* prefix_extractor */,
+ bool /*skip_filters*/) {
+ assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
+ Slice user_key = ExtractUserKey(key);
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+ uint64_t offset = bucket_length_ * CuckooHash(
+ user_key, hash_cnt, use_module_hash_, table_size_,
+ identity_as_first_hash_, get_slice_hash_);
+ const char* bucket = &file_data_.data()[offset];
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, bucket += bucket_length_) {
+ if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
+ Slice(bucket, user_key.size()))) {
+ return Status::OK();
+ }
+ // Here, we compare only the user key part as we support only one entry
+ // per user key and we don't support snapshot.
+ if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
+ Slice value(bucket + key_length_, value_length_);
+ if (is_last_level_) {
+          // Sequence number is not stored at the last level, so we use
+          // kMaxSequenceNumber since it is unknown. This could cause some
+          // transactions to fail to lock a key due to the unknown sequence
+          // number. However, no one is expected to use a CuckooTable in a
+          // TransactionDB.
+ get_context->SaveValue(value, kMaxSequenceNumber);
+ } else {
+ Slice full_key(bucket, key_length_);
+ ParsedInternalKey found_ikey;
+ ParseInternalKey(full_key, &found_ikey);
+ bool dont_care __attribute__((__unused__));
+ get_context->SaveValue(found_ikey, value, &dont_care);
+ }
+      // Merge operations are not supported, so we can return right away.
+ return Status::OK();
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void CuckooTableReader::Prepare(const Slice& key) {
+ // Prefetch the first Cuckoo Block.
+ Slice user_key = ExtractUserKey(key);
+ uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
+ bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+ identity_as_first_hash_, nullptr);
+ uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
+ for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
+ PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
+ }
+}
+
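Prepare() rounds the bucket address down to a cache-line boundary and touches every line the cuckoo block spans. A standalone sketch of that alignment walk, assuming a 64-byte line (the real code uses the platform's CACHE_LINE_SIZE and the PREFETCH macro):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kLine = 64;            // assumed cache line size
  const uint64_t kMask = ~(kLine - 1);  // clears the low 6 bits
  uint64_t addr = 1000;                 // start of the cuckoo block
  uint64_t len = 150;                   // cuckoo block bytes
  // Start at the line containing addr; stop once the block is covered.
  for (uint64_t a = addr & kMask; a < addr + len; a += kLine) {
    std::printf("prefetch line at %llu\n", static_cast<unsigned long long>(a));
  }
  return 0;  // prints 960, 1024, 1088
}
```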
+class CuckooTableIterator : public InternalIterator {
+ public:
+ explicit CuckooTableIterator(CuckooTableReader* reader);
+ // No copying allowed
+ CuckooTableIterator(const CuckooTableIterator&) = delete;
+  void operator=(const CuckooTableIterator&) = delete;
+ ~CuckooTableIterator() override {}
+ bool Valid() const override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void Next() override;
+ void Prev() override;
+ Slice key() const override;
+ Slice value() const override;
+ Status status() const override { return Status::OK(); }
+ void InitIfNeeded();
+
+ private:
+ struct BucketComparator {
+ BucketComparator(const Slice& file_data, const Comparator* ucomp,
+ uint32_t bucket_len, uint32_t user_key_len,
+ const Slice& target = Slice())
+ : file_data_(file_data),
+ ucomp_(ucomp),
+ bucket_len_(bucket_len),
+ user_key_len_(user_key_len),
+ target_(target) {}
+ bool operator()(const uint32_t first, const uint32_t second) const {
+ const char* first_bucket =
+ (first == kInvalidIndex) ? target_.data() :
+ &file_data_.data()[first * bucket_len_];
+ const char* second_bucket =
+ (second == kInvalidIndex) ? target_.data() :
+ &file_data_.data()[second * bucket_len_];
+ return ucomp_->Compare(Slice(first_bucket, user_key_len_),
+ Slice(second_bucket, user_key_len_)) < 0;
+ }
+ private:
+ const Slice file_data_;
+ const Comparator* ucomp_;
+ const uint32_t bucket_len_;
+ const uint32_t user_key_len_;
+ const Slice target_;
+ };
+
+ const BucketComparator bucket_comparator_;
+ void PrepareKVAtCurrIdx();
+ CuckooTableReader* reader_;
+ bool initialized_;
+  // Ids of the occupied buckets, sorted by the keys they contain.
+ std::vector<uint32_t> sorted_bucket_ids_;
+ // We assume that the number of items can be stored in uint32 (4 Billion).
+ uint32_t curr_key_idx_;
+ Slice curr_value_;
+ IterKey curr_key_;
+};
+
+CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
+ : bucket_comparator_(reader->file_data_, reader->ucomp_,
+ reader->bucket_length_, reader->user_key_length_),
+ reader_(reader),
+ initialized_(false),
+ curr_key_idx_(kInvalidIndex) {
+ sorted_bucket_ids_.clear();
+ curr_value_.clear();
+ curr_key_.Clear();
+}
+
+void CuckooTableIterator::InitIfNeeded() {
+ if (initialized_) {
+ return;
+ }
+ sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
+ uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
+ assert(num_buckets < kInvalidIndex);
+ const char* bucket = reader_->file_data_.data();
+ for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+ if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
+ sorted_bucket_ids_.push_back(bucket_id);
+ }
+ bucket += reader_->bucket_length_;
+ }
+ assert(sorted_bucket_ids_.size() ==
+ reader_->GetTableProperties()->num_entries);
+ std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+ bucket_comparator_);
+ curr_key_idx_ = kInvalidIndex;
+ initialized_ = true;
+}
+
+void CuckooTableIterator::SeekToFirst() {
+ InitIfNeeded();
+ curr_key_idx_ = 0;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekToLast() {
+ InitIfNeeded();
+ curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Seek(const Slice& target) {
+ InitIfNeeded();
+ const BucketComparator seek_comparator(
+ reader_->file_data_, reader_->ucomp_,
+ reader_->bucket_length_, reader_->user_key_length_,
+ ExtractUserKey(target));
+ auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
+ sorted_bucket_ids_.end(),
+ kInvalidIndex,
+ seek_comparator);
+ curr_key_idx_ =
+ static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
+ PrepareKVAtCurrIdx();
+}
+
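Seek() above leans on a small trick: kInvalidIndex never appears in sorted_bucket_ids_, so the comparator treats it as a stand-in for the target key, letting std::lower_bound binary-search a vector of bucket ids against an out-of-band value. A self-contained sketch of the idiom (names are illustrative):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

int main() {
  const uint32_t kSentinel = UINT32_MAX;
  std::vector<std::string> rows = {"apple", "kiwi", "pear"};  // key storage
  std::vector<uint32_t> ids = {0, 1, 2};  // sorted by the keys they index
  const std::string target = "melon";
  auto cmp = [&](uint32_t a, uint32_t b) {
    // The sentinel id resolves to the search target instead of a row.
    const std::string& ka = (a == kSentinel) ? target : rows[a];
    const std::string& kb = (b == kSentinel) ? target : rows[b];
    return ka < kb;
  };
  auto it = std::lower_bound(ids.begin(), ids.end(), kSentinel, cmp);
  assert(*it == 2);  // first id whose key is >= "melon" points at "pear"
  return 0;
}
```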
+void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
+ // Not supported
+ assert(false);
+}
+
+bool CuckooTableIterator::Valid() const {
+ return curr_key_idx_ < sorted_bucket_ids_.size();
+}
+
+void CuckooTableIterator::PrepareKVAtCurrIdx() {
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ uint32_t id = sorted_bucket_ids_[curr_key_idx_];
+ const char* offset = reader_->file_data_.data() +
+ id * reader_->bucket_length_;
+ if (reader_->is_last_level_) {
+ // Always return internal key.
+ curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
+ 0, kTypeValue);
+ } else {
+ curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
+ }
+ curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
+}
+
+void CuckooTableIterator::Next() {
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ ++curr_key_idx_;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Prev() {
+ if (curr_key_idx_ == 0) {
+ curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
+ }
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ --curr_key_idx_;
+ PrepareKVAtCurrIdx();
+}
+
+Slice CuckooTableIterator::key() const {
+ assert(Valid());
+ return curr_key_.GetInternalKey();
+}
+
+Slice CuckooTableIterator::value() const {
+ assert(Valid());
+ return curr_value_;
+}
+
+InternalIterator* CuckooTableReader::NewIterator(
+ const ReadOptions& /*read_options*/,
+ const SliceTransform* /* prefix_extractor */, Arena* arena,
+ bool /*skip_filters*/, TableReaderCaller /*caller*/,
+ size_t /*compaction_readahead_size*/) {
+ if (!status().ok()) {
+ return NewErrorInternalIterator<Slice>(
+ Status::Corruption("CuckooTableReader status is not okay."), arena);
+ }
+ CuckooTableIterator* iter;
+ if (arena == nullptr) {
+ iter = new CuckooTableIterator(this);
+ } else {
+ auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
+ iter = new (iter_mem) CuckooTableIterator(this);
+ }
+ return iter;
+}
+
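NewIterator() follows the usual RocksDB arena pattern: placement-new into arena memory, which the caller later releases with an explicit destructor call rather than delete (as the reader test does with it->~InternalIterator()). A minimal standalone illustration of that lifecycle, with malloc standing in for the arena:

```cpp
#include <cstdlib>
#include <new>

struct Iter {
  int pos = 0;
  ~Iter() { /* release per-iterator resources */ }
};

int main() {
  void* mem = std::malloc(sizeof(Iter));  // stand-in for Arena memory
  Iter* it = new (mem) Iter();            // construct in place
  it->~Iter();                            // destroy without freeing memory
  std::free(mem);                         // an arena reclaims this in bulk
  return 0;
}
```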
+size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
+
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
new file mode 100644
index 000000000..65bd13e1a
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class TableReader;
+
+class CuckooTableReader: public TableReader {
+ public:
+ CuckooTableReader(const ImmutableCFOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, const Comparator* user_comparator,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+ uint64_t));
+ ~CuckooTableReader() {}
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override {
+ return table_props_;
+ }
+
+ Status status() const { return status_; }
+
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+  // Returns a new iterator over the table contents.
+  // compaction_readahead_size: its value will only be used if
+  // caller == TableReaderCaller::kCompaction
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
+ void Prepare(const Slice& target) override;
+
+ // Report an approximation of how much memory has been used.
+ size_t ApproximateMemoryUsage() const override;
+
+  // The following methods are not implemented for CuckooTableReader.
+ uint64_t ApproximateOffsetOf(const Slice& /*key*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ void SetupForCompaction() override {}
+ // End of methods not implemented.
+
+ private:
+ friend class CuckooTableIterator;
+ void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id);
+ std::unique_ptr<RandomAccessFileReader> file_;
+ Slice file_data_;
+ bool is_last_level_;
+ bool identity_as_first_hash_;
+ bool use_module_hash_;
+ std::shared_ptr<const TableProperties> table_props_;
+ Status status_;
+ uint32_t num_hash_func_;
+ std::string unused_key_;
+ uint32_t key_length_;
+ uint32_t user_key_length_;
+ uint32_t value_length_;
+ uint32_t bucket_length_;
+ uint32_t cuckoo_block_size_;
+ uint32_t cuckoo_block_bytes_minus_one_;
+ uint64_t table_size_;
+ const Comparator* ucomp_;
+ uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+ uint64_t max_num_buckets);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
new file mode 100644
index 000000000..cbca768a2
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
@@ -0,0 +1,578 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <algorithm>
+#include <cinttypes>
+#include <map>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/get_context.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DEFINE_string(file_dir, "", "Directory where the files will be created"
+ " for benchmark. Added for using tmpfs.");
+DEFINE_bool(enable_perf, false, "Run Benchmark Tests too.");
+DEFINE_bool(write, false,
+ "Should write new values to file in performance tests?");
+DEFINE_bool(identity_as_first_hash, true, "use identity as first hash");
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint32_t kNumHashFunc = 10;
+// Methods, variables related to Hash functions.
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+void AddHashLookups(const std::string& s, uint64_t bucket_id,
+ uint32_t num_hash_fun) {
+ std::vector<uint64_t> v;
+ for (uint32_t i = 0; i < num_hash_fun; i++) {
+ v.push_back(bucket_id + i);
+ }
+ hash_map[s] = v;
+}
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+ uint64_t /*max_num_buckets*/) {
+ return hash_map[s.ToString()][index];
+}
+} // namespace
+
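The stubbed hash table above is what makes the collision tests deterministic: each key's full probe sequence is scripted up front instead of computed. A condensed illustration of the test seam:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

std::unordered_map<std::string, std::vector<uint64_t>> stub;

uint64_t StubHash(const std::string& key, uint32_t index) {
  return stub[key][index];  // scripted bucket per (key, hash function)
}

int main() {
  stub["a"] = {0, 1, 2};
  stub["b"] = {0, 1, 2};  // identical sequence forces full collisions
  assert(StubHash("a", 0) == StubHash("b", 0));
  return 0;
}
```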
+class CuckooReaderTest : public testing::Test {
+ public:
+ using testing::Test::SetUp;
+
+ CuckooReaderTest() {
+ options.allow_mmap_reads = true;
+ env = options.env;
+ env_options = EnvOptions(options);
+ }
+
+ void SetUp(int num) {
+ num_items = num;
+ hash_map.clear();
+ keys.clear();
+ keys.resize(num_items);
+ user_keys.clear();
+ user_keys.resize(num_items);
+ values.clear();
+ values.resize(num_items);
+ }
+
+ std::string NumToStr(int64_t i) {
+ return std::string(reinterpret_cast<char*>(&i), sizeof(i));
+ }
+
+ void CreateCuckooFileAndCheckReader(
+ const Comparator* ucomp = BytewiseComparator()) {
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ env_options));
+
+ CuckooTableBuilder builder(
+ file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false,
+ GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
+ builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
+ ASSERT_OK(builder.status());
+ ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+ }
+ ASSERT_OK(builder.Finish());
+ ASSERT_EQ(num_items, builder.NumEntries());
+ file_size = builder.FileSize();
+ ASSERT_OK(file_writer->Close());
+
+ // Check reader now.
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+ const ImmutableCFOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ // Assume no merge/deletion
+ for (uint32_t i = 0; i < num_items; ++i) {
+ PinnableSlice value;
+ GetContext get_context(ucomp, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(user_keys[i]), &value,
+ nullptr, nullptr, true, nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr));
+ ASSERT_STREQ(values[i].c_str(), value.data());
+ }
+ }
+ void UpdateKeys(bool with_zero_seqno) {
+ for (uint32_t i = 0; i < num_items; i++) {
+ ParsedInternalKey ikey(user_keys[i],
+ with_zero_seqno ? 0 : i + 1000, kTypeValue);
+ keys[i].clear();
+ AppendInternalKey(&keys[i], ikey);
+ }
+ }
+
+ void CheckIterator(const Comparator* ucomp = BytewiseComparator()) {
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+ const ImmutableCFOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ InternalIterator* it = reader.NewIterator(
+ ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(!it->Valid());
+ it->SeekToFirst();
+ int cnt = 0;
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ ++cnt;
+ it->Next();
+ }
+ ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+
+ it->SeekToLast();
+ cnt = static_cast<int>(num_items) - 1;
+ ASSERT_TRUE(it->Valid());
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ --cnt;
+ it->Prev();
+ }
+ ASSERT_EQ(cnt, -1);
+
+ cnt = static_cast<int>(num_items) / 2;
+ it->Seek(keys[cnt]);
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ ++cnt;
+ it->Next();
+ }
+ ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+ delete it;
+
+ Arena arena;
+ it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena,
+ /*skip_filters=*/false,
+ TableReaderCaller::kUncategorized);
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(!it->Valid());
+ it->Seek(keys[num_items/2]);
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(keys[num_items/2] == it->key());
+ ASSERT_TRUE(values[num_items/2] == it->value());
+ ASSERT_OK(it->status());
+ it->~InternalIterator();
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> user_keys;
+ std::vector<std::string> values;
+ uint64_t num_items;
+ std::string fname;
+ uint64_t file_size;
+ Options options;
+ Env* env;
+ EnvOptions env_options;
+};
+
+TEST_F(CuckooReaderTest, WhenKeyExists) {
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values.
+ AddHashLookups(user_keys[i], i, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+ // Test with collision. Make all hash values collide.
+ hash_map.clear();
+ for (uint32_t i = 0; i < num_items; i++) {
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ UpdateKeys(false);
+ CreateCuckooFileAndCheckReader();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) {
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i].resize(8);
+ memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values.
+ AddHashLookups(user_keys[i], i, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Test with collision. Make all hash values collide.
+ hash_map.clear();
+ for (uint32_t i = 0; i < num_items; i++) {
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ UpdateKeys(false);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, CheckIterator) {
+ SetUp(2*kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values, in reverse order.
+ AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader();
+ CheckIterator();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+ CheckIterator();
+}
+
+TEST_F(CuckooReaderTest, CheckIteratorUint64) {
+ SetUp(2*kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i].resize(8);
+ memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+ ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values, in reverse order.
+ AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ CheckIterator(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ CheckIterator(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyNotFound) {
+ // Add keys with colliding hash values.
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Make all hash values collide.
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ auto* ucmp = BytewiseComparator();
+ CreateCuckooFileAndCheckReader();
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+ const ImmutableCFOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ // Search for a key with colliding hash values.
+ std::string not_found_user_key = "key" + NumToStr(num_items);
+ std::string not_found_key;
+ AddHashLookups(not_found_user_key, 0, kNumHashFunc);
+ ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue);
+ AppendInternalKey(&not_found_key, ikey);
+ PinnableSlice value;
+ GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
+ Slice(not_found_key), &value, nullptr, nullptr, true,
+ nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+ // Search for a key with an independent hash value.
+ std::string not_found_user_key2 = "key" + NumToStr(num_items + 1);
+ AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc);
+ ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue);
+ std::string not_found_key2;
+ AppendInternalKey(&not_found_key2, ikey2);
+ value.Reset();
+ GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(not_found_key2), &value,
+ nullptr, nullptr, true, nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+
+ // Test read when key is unused key.
+ std::string unused_key =
+ reader.GetTableProperties()->user_collected_properties.at(
+ CuckooTablePropertyNames::kEmptyKey);
+ // Add hash values that map to empty buckets.
+ AddHashLookups(ExtractUserKey(unused_key).ToString(),
+ kNumHashFunc, kNumHashFunc);
+ value.Reset();
+ GetContext get_context3(ucmp, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(unused_key), &value,
+ nullptr, nullptr, true, nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+}
+
+// Performance tests
+namespace {
+void GetKeys(uint64_t num, std::vector<std::string>* keys) {
+ keys->clear();
+ IterKey k;
+ k.SetInternalKey("", 0, kTypeValue);
+ std::string internal_key_suffix = k.GetInternalKey().ToString();
+ ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size());
+ for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+ uint64_t value = 2 * key_idx;
+ std::string new_key(reinterpret_cast<char*>(&value), sizeof(value));
+ new_key += internal_key_suffix;
+ keys->push_back(new_key);
+ }
+}
+
+std::string GetFileName(uint64_t num) {
+ if (FLAGS_file_dir.empty()) {
+ FLAGS_file_dir = test::TmpDir();
+ }
+ return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") +
+ ToString(num / 1000000) + "Mkeys";
+}
+
+// Create a last-level file, as we are only interested in measuring the
+// performance of last-level files.
+void WriteFile(const std::vector<std::string>& keys,
+ const uint64_t num, double hash_ratio) {
+ Options options;
+ options.allow_mmap_reads = true;
+ Env* env = options.env;
+ EnvOptions env_options = EnvOptions(options);
+ std::string fname = GetFileName(num);
+
+ std::unique_ptr<WritableFile> writable_file;
+ ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(writable_file)), fname,
+ env_options));
+ CuckooTableBuilder builder(
+ file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5,
+ false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+ // Value is just a part of key.
+ builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4));
+ ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+ ASSERT_OK(builder.status());
+ }
+ ASSERT_OK(builder.Finish());
+ ASSERT_EQ(num, builder.NumEntries());
+ ASSERT_OK(file_writer->Close());
+
+ uint64_t file_size;
+ env->GetFileSize(fname, &file_size);
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+
+ const ImmutableCFOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+ test::Uint64Comparator(), nullptr);
+ ASSERT_OK(reader.status());
+ ReadOptions r_options;
+ PinnableSlice value;
+ // Assume only the fast path is triggered
+ GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(), &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ for (uint64_t i = 0; i < num; ++i) {
+ value.Reset();
+ value.clear();
+ ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr));
+    // The stored value is the first four bytes of the key (see Add() above),
+    // so compare the retrieved value, not the key, against that prefix.
+    ASSERT_TRUE(value == Slice(&keys[i][0], 4));
+ }
+}
+
+void ReadKeys(uint64_t num, uint32_t batch_size) {
+ Options options;
+ options.allow_mmap_reads = true;
+ Env* env = options.env;
+ EnvOptions env_options = EnvOptions(options);
+ std::string fname = GetFileName(num);
+
+ uint64_t file_size;
+ env->GetFileSize(fname, &file_size);
+ std::unique_ptr<RandomAccessFile> read_file;
+ ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file),
+ fname));
+
+ const ImmutableCFOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+ test::Uint64Comparator(), nullptr);
+ ASSERT_OK(reader.status());
+ const UserCollectedProperties user_props =
+ reader.GetTableProperties()->user_collected_properties;
+ const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
+ user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
+ const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+ user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
+ fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of"
+ " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun);
+ ReadOptions r_options;
+
+ std::vector<uint64_t> keys;
+ keys.reserve(num);
+ for (uint64_t i = 0; i < num; ++i) {
+ keys.push_back(2 * i);
+ }
+  // std::random_shuffle was removed in C++17; use std::shuffle with a fixed
+  // seed so benchmark runs stay comparable.
+  std::mt19937_64 rng(301);
+  std::shuffle(keys.begin(), keys.end(), rng);
+
+ PinnableSlice value;
+ // Assume only the fast path is triggered
+ GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(), &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ uint64_t start_time = env->NowMicros();
+ if (batch_size > 0) {
+ for (uint64_t i = 0; i < num; i += batch_size) {
+ for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+ reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16));
+ }
+ for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+ reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16),
+ &get_context, nullptr);
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < num; i++) {
+ reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16),
+ &get_context, nullptr);
+ }
+ }
+ float time_per_op = (env->NowMicros() - start_time) * 1.0f / num;
+ fprintf(stderr,
+ "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
+ time_per_op, 1.0 / time_per_op, batch_size);
+}
+} // namespace.
+
+TEST_F(CuckooReaderTest, TestReadPerformance) {
+ if (!FLAGS_enable_perf) {
+ return;
+ }
+ double hash_ratio = 0.95;
+ // These numbers are chosen to have a hash utilization % close to
+ // 0.9, 0.75, 0.6 and 0.5 respectively.
+ // They all create 128 M buckets.
+ std::vector<uint64_t> nums = {120*1024*1024, 100*1024*1024, 80*1024*1024,
+ 70*1024*1024};
+#ifndef NDEBUG
+ fprintf(stdout,
+ "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
+#endif
+ for (uint64_t num : nums) {
+ if (FLAGS_write ||
+ Env::Default()->FileExists(GetFileName(num)).IsNotFound()) {
+ std::vector<std::string> all_keys;
+ GetKeys(num, &all_keys);
+ WriteFile(all_keys, num, hash_ratio);
+ }
+ ReadKeys(num, 0);
+ ReadKeys(num, 10);
+ ReadKeys(num, 25);
+ ReadKeys(num, 50);
+ ReadKeys(num, 100);
+ fprintf(stderr, "\n");
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#endif // GFLAGS.
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc
new file mode 100644
index 000000000..ee3766eb8
--- /dev/null
+++ b/src/rocksdb/table/format.cc
@@ -0,0 +1,465 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include <cinttypes>
+#include <string>
+
+#include "block_fetcher.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/env.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/persistent_cache_helper.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+
+#ifndef ROCKSDB_LITE
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+#else
+// ROCKSDB_LITE doesn't have plain table
+const uint64_t kLegacyPlainTableMagicNumber = 0;
+const uint64_t kPlainTableMagicNumber = 0;
+#endif
+
+bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
+ return env != nullptr && stats != nullptr &&
+ stats->get_stats_level() > kExceptDetailedTimers;
+}
+
+void BlockHandle::EncodeTo(std::string* dst) const {
+ // Sanity check that all fields have been set
+ assert(offset_ != ~static_cast<uint64_t>(0));
+ assert(size_ != ~static_cast<uint64_t>(0));
+ PutVarint64Varint64(dst, offset_, size_);
+}
+
+Status BlockHandle::DecodeFrom(Slice* input) {
+ if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
+ return Status::OK();
+ } else {
+ // reset in case failure after partially decoding
+ offset_ = 0;
+ size_ = 0;
+ return Status::Corruption("bad block handle");
+ }
+}
+
+Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
+ if (GetVarint64(input, &size_)) {
+ offset_ = _offset;
+ return Status::OK();
+ } else {
+ // reset in case failure after partially decoding
+ offset_ = 0;
+ size_ = 0;
+ return Status::Corruption("bad block handle");
+ }
+}
+
+// Returns a string containing a copy of the handle's encoding.
+std::string BlockHandle::ToString(bool hex) const {
+ std::string handle_str;
+ EncodeTo(&handle_str);
+ if (hex) {
+ return Slice(handle_str).ToString(true);
+ } else {
+ return handle_str;
+ }
+}
+
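A quick round trip through the two methods above, assuming it is compiled inside the RocksDB tree; the handle encodes as two varint64s, so at most kMaxEncodedLength (20) bytes:

```cpp
#include <cassert>
#include <string>

#include "table/format.h"

void BlockHandleRoundTrip() {
  ROCKSDB_NAMESPACE::BlockHandle in(/*offset=*/4096, /*size=*/512);
  std::string buf;
  in.EncodeTo(&buf);  // appends two varint64s
  ROCKSDB_NAMESPACE::Slice input(buf);
  ROCKSDB_NAMESPACE::BlockHandle out;
  assert(out.DecodeFrom(&input).ok());
  assert(out.offset() == 4096 && out.size() == 512);
}
```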
+const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
+
+void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
+ const BlockHandle* previous_handle) const {
+ if (previous_handle) {
+ assert(handle.offset() == previous_handle->offset() +
+ previous_handle->size() + kBlockTrailerSize);
+ PutVarsignedint64(dst, handle.size() - previous_handle->size());
+ } else {
+ handle.EncodeTo(dst);
+ }
+ assert(dst->size() != 0);
+
+ if (have_first_key) {
+ PutLengthPrefixedSlice(dst, first_internal_key);
+ }
+}
+
+Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
+ const BlockHandle* previous_handle) {
+ if (previous_handle) {
+ int64_t delta;
+ if (!GetVarsignedint64(input, &delta)) {
+ return Status::Corruption("bad delta-encoded index value");
+ }
+ handle = BlockHandle(
+ previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
+ previous_handle->size() + delta);
+ } else {
+ Status s = handle.DecodeFrom(input);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!have_first_key) {
+ first_internal_key = Slice();
+ } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
+ return Status::Corruption("bad first key in block info");
+ }
+
+ return Status::OK();
+}
+
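To make the delta encoding concrete: only the signed size delta is persisted, and the offset is recomputed from the previous handle plus its trailer. A worked example with illustrative numbers:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t kTrailer = 5;  // 1-byte type + 32-bit crc
  uint64_t prev_offset = 0, prev_size = 100;
  int64_t stored_delta = -10;   // varsigned value written by EncodeTo
  // Blocks are consecutive, so the next offset is fully determined.
  uint64_t next_offset = prev_offset + prev_size + kTrailer;  // 105
  uint64_t next_size =
      static_cast<uint64_t>(static_cast<int64_t>(prev_size) + stored_delta);
  assert(next_offset == 105 && next_size == 90);
  return 0;
}
```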
+std::string IndexValue::ToString(bool hex, bool have_first_key) const {
+ std::string s;
+ EncodeTo(&s, have_first_key, nullptr);
+ if (hex) {
+ return Slice(s).ToString(true);
+ } else {
+ return s;
+ }
+}
+
+namespace {
+inline bool IsLegacyFooterFormat(uint64_t magic_number) {
+ return magic_number == kLegacyBlockBasedTableMagicNumber ||
+ magic_number == kLegacyPlainTableMagicNumber;
+}
+inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
+ if (magic_number == kLegacyBlockBasedTableMagicNumber) {
+ return kBlockBasedTableMagicNumber;
+ }
+ if (magic_number == kLegacyPlainTableMagicNumber) {
+ return kPlainTableMagicNumber;
+ }
+ assert(false);
+ return 0;
+}
+} // namespace
+
+// legacy footer format:
+// metaindex handle (varint64 offset, varint64 size)
+// index handle (varint64 offset, varint64 size)
+// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
+// table_magic_number (8 bytes)
+// new footer format:
+// checksum type (char, 1 byte)
+// metaindex handle (varint64 offset, varint64 size)
+// index handle (varint64 offset, varint64 size)
+// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
+// footer version (4 bytes)
+// table_magic_number (8 bytes)
+void Footer::EncodeTo(std::string* dst) const {
+ assert(HasInitializedTableMagicNumber());
+ if (IsLegacyFooterFormat(table_magic_number())) {
+ // has to be default checksum with legacy footer
+ assert(checksum_ == kCRC32c);
+ const size_t original_size = dst->size();
+ metaindex_handle_.EncodeTo(dst);
+ index_handle_.EncodeTo(dst);
+ dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
+ PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
+ PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
+ assert(dst->size() == original_size + kVersion0EncodedLength);
+ } else {
+ const size_t original_size = dst->size();
+ dst->push_back(static_cast<char>(checksum_));
+ metaindex_handle_.EncodeTo(dst);
+ index_handle_.EncodeTo(dst);
+ dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
+ PutFixed32(dst, version());
+ PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
+ PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
+ assert(dst->size() == original_size + kNewVersionsEncodedLength);
+ }
+}
+
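The resize() padding in the new-format branch follows directly from the fixed lengths: the checksum byte plus two maximally-encoded handles occupy 41 bytes, and the trailing version plus magic number occupy the remaining 12. A small arithmetic check:

```cpp
#include <cassert>

int main() {
  const int kMaxHandle = 10 + 10;               // one varint64 is <= 10 bytes
  const int kV0 = 2 * kMaxHandle + 8;           // handles + magic      = 48
  const int kNew = 1 + 2 * kMaxHandle + 4 + 8;  // + checksum + version = 53
  // EncodeTo pads to (kNew - 12) before appending the 4-byte version and
  // the 8-byte magic number.
  assert(kV0 == 48 && kNew == 53 && kNew - 12 == 41);
  return 0;
}
```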
+Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
+ : version_(_version),
+ checksum_(kCRC32c),
+ table_magic_number_(_table_magic_number) {
+ // This should be guaranteed by constructor callers
+ assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
+}
+
+Status Footer::DecodeFrom(Slice* input) {
+ assert(!HasInitializedTableMagicNumber());
+ assert(input != nullptr);
+ assert(input->size() >= kMinEncodedLength);
+
+ const char* magic_ptr =
+ input->data() + input->size() - kMagicNumberLengthByte;
+ const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+ const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+ uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+ (static_cast<uint64_t>(magic_lo)));
+
+ // We check for legacy formats here and silently upconvert them
+ bool legacy = IsLegacyFooterFormat(magic);
+ if (legacy) {
+ magic = UpconvertLegacyFooterFormat(magic);
+ }
+ set_table_magic_number(magic);
+
+ if (legacy) {
+ // The size is already asserted to be at least kMinEncodedLength
+ // at the beginning of the function
+ input->remove_prefix(input->size() - kVersion0EncodedLength);
+ version_ = 0 /* legacy */;
+ checksum_ = kCRC32c;
+ } else {
+ version_ = DecodeFixed32(magic_ptr - 4);
+ // Footer version 1 and higher will always occupy exactly this many bytes.
+ // It consists of the checksum type, two block handles, padding,
+ // a version number, and a magic number
+ if (input->size() < kNewVersionsEncodedLength) {
+ return Status::Corruption("input is too short to be an sstable");
+ } else {
+ input->remove_prefix(input->size() - kNewVersionsEncodedLength);
+ }
+ uint32_t chksum;
+ if (!GetVarint32(input, &chksum)) {
+ return Status::Corruption("bad checksum type");
+ }
+ checksum_ = static_cast<ChecksumType>(chksum);
+ }
+
+ Status result = metaindex_handle_.DecodeFrom(input);
+ if (result.ok()) {
+ result = index_handle_.DecodeFrom(input);
+ }
+ if (result.ok()) {
+ // We skip over any leftover data (just padding for now) in "input"
+ const char* end = magic_ptr + kMagicNumberLengthByte;
+ *input = Slice(end, input->data() + input->size() - end);
+ }
+ return result;
+}
+
+std::string Footer::ToString() const {
+ std::string result;
+ result.reserve(1024);
+
+ bool legacy = IsLegacyFooterFormat(table_magic_number_);
+ if (legacy) {
+ result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
+ result.append("index handle: " + index_handle_.ToString() + "\n ");
+ result.append("table_magic_number: " +
+ ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
+ } else {
+ result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
+ "\n ");
+ result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
+ result.append("index handle: " + index_handle_.ToString() + "\n ");
+ result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
+ "\n ");
+ result.append("table_magic_number: " +
+ ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
+ }
+ return result;
+}
+
+Status ReadFooterFromFile(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer,
+ uint64_t file_size, Footer* footer,
+ uint64_t enforce_table_magic_number) {
+ if (file_size < Footer::kMinEncodedLength) {
+ return Status::Corruption("file is too short (" + ToString(file_size) +
+ " bytes) to be an "
+ "sstable: " +
+ file->file_name());
+ }
+
+ char footer_space[Footer::kMaxEncodedLength];
+ Slice footer_input;
+ size_t read_offset =
+ (file_size > Footer::kMaxEncodedLength)
+ ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
+ : 0;
+ Status s;
+ if (prefetch_buffer == nullptr ||
+ !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
+ &footer_input)) {
+ s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
+ footer_space);
+ if (!s.ok()) return s;
+ }
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that the file size we were given isn't correct.
+  if (footer_input.size() < Footer::kMinEncodedLength) {
+    return Status::Corruption("file is too short (" + ToString(file_size) +
+                              " bytes) to be an "
+                              "sstable: " +
+                              file->file_name());
+ }
+
+ s = footer->DecodeFrom(&footer_input);
+ if (!s.ok()) {
+ return s;
+ }
+ if (enforce_table_magic_number != 0 &&
+ enforce_table_magic_number != footer->table_magic_number()) {
+ return Status::Corruption(
+ "Bad table magic number: expected " +
+ ToString(enforce_table_magic_number) + ", found " +
+ ToString(footer->table_magic_number()) + " in " + file->file_name());
+ }
+ return Status::OK();
+}
+
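A minimal caller-side sketch, assuming compilation inside the RocksDB tree; passing a nonzero expected magic number turns a mismatch into Status::Corruption, while 0 (the default) skips the check:

```cpp
#include <cstdint>

#include "table/format.h"

namespace ROCKSDB_NAMESPACE {
extern const uint64_t kBlockBasedTableMagicNumber;  // defined elsewhere
}

ROCKSDB_NAMESPACE::Status CheckFooter(
    ROCKSDB_NAMESPACE::RandomAccessFileReader* file, uint64_t file_size) {
  ROCKSDB_NAMESPACE::Footer footer;
  return ROCKSDB_NAMESPACE::ReadFooterFromFile(
      file, /*prefetch_buffer=*/nullptr, file_size, &footer,
      ROCKSDB_NAMESPACE::kBlockBasedTableMagicNumber);
}
```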
+Status UncompressBlockContentsForCompressionType(
+ const UncompressionInfo& uncompression_info, const char* data, size_t n,
+ BlockContents* contents, uint32_t format_version,
+ const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
+ CacheAllocationPtr ubuf;
+
+ assert(uncompression_info.type() != kNoCompression &&
+ "Invalid compression type");
+
+ StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
+ ioptions.env, ioptions.statistics));
+ int decompress_size = 0;
+ switch (uncompression_info.type()) {
+ case kSnappyCompression: {
+ size_t ulength = 0;
+ static char snappy_corrupt_msg[] =
+ "Snappy not supported or corrupted Snappy compressed block contents";
+ if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
+ return Status::Corruption(snappy_corrupt_msg);
+ }
+ ubuf = AllocateBlock(ulength, allocator);
+ if (!Snappy_Uncompress(data, n, ubuf.get())) {
+ return Status::Corruption(snappy_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), ulength);
+ break;
+ }
+ case kZlibCompression:
+ ubuf = Zlib_Uncompress(
+ uncompression_info, data, n, &decompress_size,
+ GetCompressFormatForVersion(kZlibCompression, format_version),
+ allocator);
+ if (!ubuf) {
+ static char zlib_corrupt_msg[] =
+ "Zlib not supported or corrupted Zlib compressed block contents";
+ return Status::Corruption(zlib_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ case kBZip2Compression:
+ ubuf = BZip2_Uncompress(
+ data, n, &decompress_size,
+ GetCompressFormatForVersion(kBZip2Compression, format_version),
+ allocator);
+ if (!ubuf) {
+ static char bzip2_corrupt_msg[] =
+ "Bzip2 not supported or corrupted Bzip2 compressed block contents";
+ return Status::Corruption(bzip2_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ case kLZ4Compression:
+ ubuf = LZ4_Uncompress(
+ uncompression_info, data, n, &decompress_size,
+ GetCompressFormatForVersion(kLZ4Compression, format_version),
+ allocator);
+ if (!ubuf) {
+ static char lz4_corrupt_msg[] =
+ "LZ4 not supported or corrupted LZ4 compressed block contents";
+ return Status::Corruption(lz4_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ case kLZ4HCCompression:
+ ubuf = LZ4_Uncompress(
+ uncompression_info, data, n, &decompress_size,
+ GetCompressFormatForVersion(kLZ4HCCompression, format_version),
+ allocator);
+ if (!ubuf) {
+ static char lz4hc_corrupt_msg[] =
+ "LZ4HC not supported or corrupted LZ4HC compressed block contents";
+ return Status::Corruption(lz4hc_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ case kXpressCompression:
+ // XPRESS allocates memory internally, thus no support for custom
+ // allocator.
+ ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
+ if (!ubuf) {
+ static char xpress_corrupt_msg[] =
+ "XPRESS not supported or corrupted XPRESS compressed block "
+ "contents";
+ return Status::Corruption(xpress_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ case kZSTD:
+ case kZSTDNotFinalCompression:
+ ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size,
+ allocator);
+ if (!ubuf) {
+ static char zstd_corrupt_msg[] =
+ "ZSTD not supported or corrupted ZSTD compressed block contents";
+ return Status::Corruption(zstd_corrupt_msg);
+ }
+ *contents = BlockContents(std::move(ubuf), decompress_size);
+ break;
+ default:
+ return Status::Corruption("bad block type");
+ }
+
+ if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
+ RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
+ timer.ElapsedNanos());
+ }
+ RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
+ contents->data.size());
+ RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
+
+ return Status::OK();
+}
+
+// The 'data' argument points to the raw block contents read from the file.
+// This method allocates a new heap buffer, uncompresses the raw block
+// contents into it, and returns the buffer to the caller via 'contents';
+// it is up to the caller to free the buffer.
+// format_version is the block format as defined in include/rocksdb/table.h
+Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
+ const char* data, size_t n,
+ BlockContents* contents, uint32_t format_version,
+ const ImmutableCFOptions& ioptions,
+ MemoryAllocator* allocator) {
+ assert(data[n] != kNoCompression);
+ assert(data[n] == uncompression_info.type());
+ return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
+ contents, format_version,
+ ioptions, allocator);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h
new file mode 100644
index 000000000..ad65fdbfb
--- /dev/null
+++ b/src/rocksdb/table/format.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "file/file_prefetch_buffer.h"
+#include "file/random_access_file_reader.h"
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+#include "memory/memory_allocator.h"
+#include "options/cf_options.h"
+#include "port/malloc.h"
+#include "port/port.h" // noexcept
+#include "table/persistent_cache_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFile;
+struct ReadOptions;
+
+extern bool ShouldReportDetailedTime(Env* env, Statistics* stats);
+
+// the length of the magic number in bytes.
+const int kMagicNumberLengthByte = 8;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+ BlockHandle();
+ BlockHandle(uint64_t offset, uint64_t size);
+
+ // The offset of the block in the file.
+ uint64_t offset() const { return offset_; }
+ void set_offset(uint64_t _offset) { offset_ = _offset; }
+
+ // The size of the stored block
+ uint64_t size() const { return size_; }
+ void set_size(uint64_t _size) { size_ = _size; }
+
+ void EncodeTo(std::string* dst) const;
+ Status DecodeFrom(Slice* input);
+ Status DecodeSizeFrom(uint64_t offset, Slice* input);
+
+  // Returns a string containing a copy of the handle's encoding.
+ std::string ToString(bool hex = true) const;
+
+  // If the block handle's offset and size are both "0", we view it
+  // as a null block handle that points nowhere.
+ bool IsNull() const { return offset_ == 0 && size_ == 0; }
+
+ static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
+
+ // Maximum encoding length of a BlockHandle
+ enum { kMaxEncodedLength = 10 + 10 };
+
+ private:
+ uint64_t offset_;
+ uint64_t size_;
+
+ static const BlockHandle kNullBlockHandle;
+};
+
+// Value in block-based table file index.
+//
+// The index entry for block n is: y -> h, [x],
+// where: y is some key between the last key of block n (inclusive) and the
+// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
+// x, if present, is the first key of block n (unshortened).
+// This struct represents the "h, [x]" part.
+struct IndexValue {
+ BlockHandle handle;
+ // Empty means unknown.
+ Slice first_internal_key;
+
+ IndexValue() = default;
+ IndexValue(BlockHandle _handle, Slice _first_internal_key)
+ : handle(_handle), first_internal_key(_first_internal_key) {}
+
+ // have_first_key indicates whether the `first_internal_key` is used.
+ // If previous_handle is not null, delta encoding is used;
+ // in this case, the two handles must point to consecutive blocks:
+ // handle.offset() ==
+ // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
+ void EncodeTo(std::string* dst, bool have_first_key,
+ const BlockHandle* previous_handle) const;
+ Status DecodeFrom(Slice* input, bool have_first_key,
+ const BlockHandle* previous_handle);
+
+ std::string ToString(bool hex, bool have_first_key) const;
+};
+
+inline uint32_t GetCompressFormatForVersion(CompressionType compression_type,
+ uint32_t version) {
+#ifdef NDEBUG
+ (void)compression_type;
+#endif
+ // snappy is not versioned
+ assert(compression_type != kSnappyCompression &&
+ compression_type != kXpressCompression &&
+ compression_type != kNoCompression);
+ // As of version 2, we encode compressed block with
+ // compress_format_version == 2. Before that, the version is 1.
+ // DO NOT CHANGE THIS FUNCTION, it affects disk format
+ return version >= 2 ? 2 : 1;
+}
+
+inline bool BlockBasedTableSupportedVersion(uint32_t version) {
+ return version <= 5;
+}
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every table file.
+class Footer {
+ public:
+ // Constructs a footer without specifying its table magic number.
+ // In such case, the table magic number of such footer should be
+ // initialized via @ReadFooterFromFile().
+ // Use this when you plan to load Footer with DecodeFrom(). Never use this
+  // when you plan to call EncodeTo().
+ Footer() : Footer(kInvalidTableMagicNumber, 0) {}
+
+ // Use this constructor when you plan to write out the footer using
+ // EncodeTo(). Never use this constructor with DecodeFrom().
+ Footer(uint64_t table_magic_number, uint32_t version);
+
+ // The version of the footer in this file
+ uint32_t version() const { return version_; }
+
+ // The checksum type used in this file
+ ChecksumType checksum() const { return checksum_; }
+ void set_checksum(const ChecksumType c) { checksum_ = c; }
+
+ // The block handle for the metaindex block of the table
+ const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+ void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
+
+ // The block handle for the index block of the table
+ const BlockHandle& index_handle() const { return index_handle_; }
+
+ void set_index_handle(const BlockHandle& h) { index_handle_ = h; }
+
+ uint64_t table_magic_number() const { return table_magic_number_; }
+
+ void EncodeTo(std::string* dst) const;
+
+ // Set the current footer based on the input slice.
+ //
+  // REQUIRES: table_magic_number_ is not set (i.e.,
+  // HasInitializedTableMagicNumber() is false). The function will initialize
+  // the magic number.
+ Status DecodeFrom(Slice* input);
+
+ // Encoded length of a Footer. Note that the serialization of a Footer will
+ // always occupy at least kMinEncodedLength bytes. If fields are changed
+ // the version number should be incremented and kMaxEncodedLength should be
+ // increased accordingly.
+ enum {
+ // Footer version 0 (legacy) will always occupy exactly this many bytes.
+ // It consists of two block handles, padding, and a magic number.
+ kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
+ // Footer of versions 1 and higher will always occupy exactly this many
+ // bytes. It consists of the checksum type, two block handles, padding,
+  // a version number (at least 1), and a magic number.
+ kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
+ kMinEncodedLength = kVersion0EncodedLength,
+ kMaxEncodedLength = kNewVersionsEncodedLength,
+ };
+
+ static const uint64_t kInvalidTableMagicNumber = 0;
+
+ // convert this object to a human readable form
+ std::string ToString() const;
+
+ private:
+ // REQUIRES: magic number wasn't initialized.
+ void set_table_magic_number(uint64_t magic_number) {
+ assert(!HasInitializedTableMagicNumber());
+ table_magic_number_ = magic_number;
+ }
+
+ // return true if @table_magic_number_ is set to a value different
+ // from @kInvalidTableMagicNumber.
+ bool HasInitializedTableMagicNumber() const {
+ return (table_magic_number_ != kInvalidTableMagicNumber);
+ }
+
+ uint32_t version_;
+ ChecksumType checksum_;
+ BlockHandle metaindex_handle_;
+ BlockHandle index_handle_;
+ uint64_t table_magic_number_ = 0;
+};
+
+// Read the footer from the file.
+// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
+// corruption if the table magic number is not equal to
+// enforce_table_magic_number.
+Status ReadFooterFromFile(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer,
+ uint64_t file_size, Footer* footer,
+ uint64_t enforce_table_magic_number = 0);
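A rough usage sketch (error handling abbreviated; `file` and `file_size` are assumed to be supplied by the caller):

    Footer footer;  // default-constructed, suitable only for DecodeFrom()
    Status s = ReadFooterFromFile(file, /*prefetch_buffer=*/nullptr,
                                  file_size, &footer);
    if (s.ok() && BlockBasedTableSupportedVersion(footer.version())) {
      const BlockHandle& index = footer.index_handle();
      // ... proceed to read the index block at index.offset() ...
    }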
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
+// Make block size calculation for IO less error prone
+inline uint64_t block_size(const BlockHandle& handle) {
+ return handle.size() + kBlockTrailerSize;
+}
+
+inline CompressionType get_block_compression_type(const char* block_data,
+ size_t block_size) {
+ return static_cast<CompressionType>(block_data[block_size]);
+}
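Putting these two helpers together: a block on disk is the block data immediately followed by the 5-byte trailer, so a reader sizes its IO with block_size() and then reads the compression type from the first trailer byte. A sketch, with `buf` and `handle` assumed from the caller:

    // buf holds block_size(handle) bytes read from the file at handle.offset().
    uint64_t io_size = block_size(handle);  // data + 5-byte trailer
    CompressionType ctype = get_block_compression_type(
        buf, static_cast<size_t>(handle.size()));  // first trailer byte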
+
+// Represents the contents of a block read from an SST file. Depending on how
+// it's created, it may or may not own the actual block bytes. As an example,
+// BlockContents objects representing data read from mmapped files only point
+// into the mmapped region.
+struct BlockContents {
+ Slice data; // Actual contents of data
+ CacheAllocationPtr allocation;
+
+#ifndef NDEBUG
+ // Whether the block is a raw block, which contains compression type
+ // byte. It is only used for assertion.
+ bool is_raw_block = false;
+#endif // NDEBUG
+
+ BlockContents() {}
+
+ // Does not take ownership of the underlying data bytes.
+ BlockContents(const Slice& _data) : data(_data) {}
+
+ // Takes ownership of the underlying data bytes.
+ BlockContents(CacheAllocationPtr&& _data, size_t _size)
+ : data(_data.get(), _size), allocation(std::move(_data)) {}
+
+ // Takes ownership of the underlying data bytes.
+ BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
+ : data(_data.get(), _size) {
+ allocation.reset(_data.release());
+ }
+
+ // Returns whether the object has ownership of the underlying data bytes.
+ bool own_bytes() const { return allocation.get() != nullptr; }
+
+  // It's the caller's responsibility to make sure that this is only called
+  // for raw block contents, which contain the compression type byte at
+  // the end.
+ CompressionType get_compression_type() const {
+ assert(is_raw_block);
+ return get_block_compression_type(data.data(), data.size());
+ }
+
+ // The additional memory space taken by the block data.
+ size_t usable_size() const {
+ if (allocation.get() != nullptr) {
+ auto allocator = allocation.get_deleter().allocator;
+ if (allocator) {
+ return allocator->UsableSize(allocation.get(), data.size());
+ }
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ return malloc_usable_size(allocation.get());
+#else
+ return data.size();
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ } else {
+ return 0; // no extra memory is occupied by the data
+ }
+ }
+
+ size_t ApproximateMemoryUsage() const {
+ return usable_size() + sizeof(*this);
+ }
+
+ BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
+ *this = std::move(other);
+ }
+
+ BlockContents& operator=(BlockContents&& other) {
+ data = std::move(other.data);
+ allocation = std::move(other.allocation);
+#ifndef NDEBUG
+ is_raw_block = other.is_raw_block;
+#endif // NDEBUG
+ return *this;
+ }
+};
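A minimal sketch contrasting the two ownership modes (includes such as <cassert>, <cstring>, and <memory> elided; the backing string stands in for, e.g., an mmapped region):

    std::string backing = "block bytes";            // caller-owned storage
    BlockContents borrowed{Slice(backing)};         // does not take ownership
    assert(!borrowed.own_bytes());

    size_t len = backing.size();
    std::unique_ptr<char[]> heap_buf(new char[len]);
    memcpy(heap_buf.get(), backing.data(), len);
    BlockContents owned(std::move(heap_buf), len);  // takes ownership
    assert(owned.own_bytes());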
+
+// Read the block identified by "handle" from "file". On failure
+// return non-OK. On success fill *result and return OK.
+extern Status ReadBlockContents(
+ RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+ const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
+ BlockContents* contents, const ImmutableCFOptions& ioptions,
+ bool do_uncompress = true, const Slice& compression_dict = Slice(),
+ const PersistentCacheOptions& cache_options = PersistentCacheOptions());
+
+// The 'data' points to the raw block contents read in from file.
+// This method allocates a new heap buffer and the raw block
+// contents are uncompressed into this buffer. This buffer is
+// returned via 'result' and it is up to the caller to
+// free this buffer.
+// For description of compress_format_version and possible values, see
+// util/compression.h
+extern Status UncompressBlockContents(const UncompressionInfo& info,
+ const char* data, size_t n,
+ BlockContents* contents,
+ uint32_t compress_format_version,
+ const ImmutableCFOptions& ioptions,
+ MemoryAllocator* allocator = nullptr);
+
+// This is an extension to UncompressBlockContents that accepts
+// a specific compression type. This is used by un-wrapped blocks
+// with no compression header.
+extern Status UncompressBlockContentsForCompressionType(
+ const UncompressionInfo& info, const char* data, size_t n,
+ BlockContents* contents, uint32_t compress_format_version,
+ const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr);
+
+// Implementation details follow. Clients should ignore.
+
+// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
+// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
+// uninitialized.
+inline BlockHandle::BlockHandle()
+ : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
+
+inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
+ : offset_(_offset), size_(_size) {}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc
new file mode 100644
index 000000000..0e7ac0598
--- /dev/null
+++ b/src/rocksdb/table/get_context.cc
@@ -0,0 +1,366 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/get_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/read_callback.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
+#ifndef ROCKSDB_LITE
+ if (replay_log) {
+ if (replay_log->empty()) {
+ // Optimization: in the common case of only one operation in the
+ // log, we allocate the exact amount of space needed.
+ replay_log->reserve(1 + VarintLength(value.size()) + value.size());
+ }
+ replay_log->push_back(type);
+ PutLengthPrefixedSlice(replay_log, value);
+ }
+#else
+ (void)replay_log;
+ (void)type;
+ (void)value;
+#endif // ROCKSDB_LITE
+}
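The log is therefore just a concatenation of (type byte, length-prefixed value) records. A hedged sketch of what a single kTypeValue entry looks like:

    std::string log;
    appendToReplayLog(&log, kTypeValue, Slice("v1"));
    // log now holds: 0x01 (kTypeValue), 0x02 (varint32 length), 'v', '1'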
+
+} // namespace
+
+GetContext::GetContext(
+ const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
+ Statistics* statistics, GetState init_state, const Slice& user_key,
+ PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
+ bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env,
+ SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr,
+ ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id)
+ : ucmp_(ucmp),
+ merge_operator_(merge_operator),
+ logger_(logger),
+ statistics_(statistics),
+ state_(init_state),
+ user_key_(user_key),
+ pinnable_val_(pinnable_val),
+ value_found_(value_found),
+ merge_context_(merge_context),
+ max_covering_tombstone_seq_(_max_covering_tombstone_seq),
+ env_(env),
+ seq_(seq),
+ replay_log_(nullptr),
+ pinned_iters_mgr_(_pinned_iters_mgr),
+ callback_(callback),
+ do_merge_(do_merge),
+ is_blob_index_(is_blob_index),
+ tracing_get_id_(tracing_get_id) {
+ if (seq_) {
+ *seq_ = kMaxSequenceNumber;
+ }
+ sample_ = should_sample_file_read();
+}
+
+// Called from TableCache::Get and Table::Get when the file/block in which
+// the key may exist is not present in the TableCache/BlockCache,
+// respectively. In this case we can't guarantee that the key does not exist
+// and are not permitted to do IO to be certain. Set state_ to kFound and
+// value_found to false to let the caller know that the key may exist but is
+// not in memory.
+void GetContext::MarkKeyMayExist() {
+ state_ = kFound;
+ if (value_found_ != nullptr) {
+ *value_found_ = false;
+ }
+}
+
+void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
+ assert(state_ == kNotFound);
+ appendToReplayLog(replay_log_, kTypeValue, value);
+
+ state_ = kFound;
+ if (LIKELY(pinnable_val_ != nullptr)) {
+ pinnable_val_->PinSelf(value);
+ }
+}
+
+void GetContext::ReportCounters() {
+ if (get_context_stats_.num_cache_hit > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
+ }
+ if (get_context_stats_.num_cache_index_hit > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
+ get_context_stats_.num_cache_index_hit);
+ }
+ if (get_context_stats_.num_cache_data_hit > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
+ get_context_stats_.num_cache_data_hit);
+ }
+ if (get_context_stats_.num_cache_filter_hit > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
+ get_context_stats_.num_cache_filter_hit);
+ }
+ if (get_context_stats_.num_cache_compression_dict_hit > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
+ get_context_stats_.num_cache_compression_dict_hit);
+ }
+ if (get_context_stats_.num_cache_index_miss > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
+ get_context_stats_.num_cache_index_miss);
+ }
+ if (get_context_stats_.num_cache_filter_miss > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
+ get_context_stats_.num_cache_filter_miss);
+ }
+ if (get_context_stats_.num_cache_data_miss > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
+ get_context_stats_.num_cache_data_miss);
+ }
+ if (get_context_stats_.num_cache_compression_dict_miss > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
+ get_context_stats_.num_cache_compression_dict_miss);
+ }
+ if (get_context_stats_.num_cache_bytes_read > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
+ get_context_stats_.num_cache_bytes_read);
+ }
+ if (get_context_stats_.num_cache_miss > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_MISS,
+ get_context_stats_.num_cache_miss);
+ }
+ if (get_context_stats_.num_cache_add > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
+ }
+ if (get_context_stats_.num_cache_bytes_write > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
+ get_context_stats_.num_cache_bytes_write);
+ }
+ if (get_context_stats_.num_cache_index_add > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
+ get_context_stats_.num_cache_index_add);
+ }
+ if (get_context_stats_.num_cache_index_bytes_insert > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
+ get_context_stats_.num_cache_index_bytes_insert);
+ }
+ if (get_context_stats_.num_cache_data_add > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
+ get_context_stats_.num_cache_data_add);
+ }
+ if (get_context_stats_.num_cache_data_bytes_insert > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
+ get_context_stats_.num_cache_data_bytes_insert);
+ }
+ if (get_context_stats_.num_cache_filter_add > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
+ get_context_stats_.num_cache_filter_add);
+ }
+ if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
+ get_context_stats_.num_cache_filter_bytes_insert);
+ }
+ if (get_context_stats_.num_cache_compression_dict_add > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
+ get_context_stats_.num_cache_compression_dict_add);
+ }
+ if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
+ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+ get_context_stats_.num_cache_compression_dict_bytes_insert);
+ }
+}
+
+bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
+ const Slice& value, bool* matched,
+ Cleanable* value_pinner) {
+ assert(matched);
+ assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
+ merge_context_ != nullptr);
+ if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
+ *matched = true;
+ // If the value is not in the snapshot, skip it
+ if (!CheckCallback(parsed_key.sequence)) {
+ return true; // to continue to the next seq
+ }
+
+ appendToReplayLog(replay_log_, parsed_key.type, value);
+
+ if (seq_ != nullptr) {
+ // Set the sequence number if it is uninitialized
+ if (*seq_ == kMaxSequenceNumber) {
+ *seq_ = parsed_key.sequence;
+ }
+ }
+
+ auto type = parsed_key.type;
+ // Key matches. Process it
+ if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
+ max_covering_tombstone_seq_ != nullptr &&
+ *max_covering_tombstone_seq_ > parsed_key.sequence) {
+ type = kTypeRangeDeletion;
+ }
+ switch (type) {
+ case kTypeValue:
+ case kTypeBlobIndex:
+ assert(state_ == kNotFound || state_ == kMerge);
+ if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
+ // Blob value not supported. Stop.
+ state_ = kBlobIndex;
+ return false;
+ }
+ if (kNotFound == state_) {
+ state_ = kFound;
+ if (do_merge_) {
+ if (LIKELY(pinnable_val_ != nullptr)) {
+ if (LIKELY(value_pinner != nullptr)) {
+ // If the backing resources for the value are provided, pin them
+ pinnable_val_->PinSlice(value, value_pinner);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
+ this);
+
+ // Otherwise copy the value
+ pinnable_val_->PinSelf(value);
+ }
+ }
+ } else {
+ // It means this function is called as part of DB GetMergeOperands
+ // API and the current value should be part of
+ // merge_context_->operand_list
+ push_operand(value, value_pinner);
+ }
+ } else if (kMerge == state_) {
+ assert(merge_operator_ != nullptr);
+ state_ = kFound;
+ if (do_merge_) {
+ if (LIKELY(pinnable_val_ != nullptr)) {
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key_, &value,
+ merge_context_->GetOperands(), pinnable_val_->GetSelf(),
+ logger_, statistics_, env_);
+ pinnable_val_->PinSelf();
+ if (!merge_status.ok()) {
+ state_ = kCorrupt;
+ }
+ }
+ } else {
+ // It means this function is called as part of DB GetMergeOperands
+ // API and the current value should be part of
+ // merge_context_->operand_list
+ push_operand(value, value_pinner);
+ }
+ }
+ if (is_blob_index_ != nullptr) {
+ *is_blob_index_ = (type == kTypeBlobIndex);
+ }
+ return false;
+
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion:
+ // TODO(noetzli): Verify correctness once merge of single-deletes
+ // is supported
+ assert(state_ == kNotFound || state_ == kMerge);
+ if (kNotFound == state_) {
+ state_ = kDeleted;
+ } else if (kMerge == state_) {
+ state_ = kFound;
+ if (LIKELY(pinnable_val_ != nullptr)) {
+ if (do_merge_) {
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key_, nullptr,
+ merge_context_->GetOperands(), pinnable_val_->GetSelf(),
+ logger_, statistics_, env_);
+ pinnable_val_->PinSelf();
+ if (!merge_status.ok()) {
+ state_ = kCorrupt;
+ }
+ }
+ // If do_merge_ = false then the current value shouldn't be part of
+ // merge_context_->operand_list
+ }
+ }
+ return false;
+
+ case kTypeMerge:
+ assert(state_ == kNotFound || state_ == kMerge);
+ state_ = kMerge;
+ // value_pinner is not set from plain_table_reader.cc for example.
+ push_operand(value, value_pinner);
+ if (do_merge_ && merge_operator_ != nullptr &&
+ merge_operator_->ShouldMerge(
+ merge_context_->GetOperandsDirectionBackward())) {
+ state_ = kFound;
+ if (LIKELY(pinnable_val_ != nullptr)) {
+          // do_merge_ == true means this function is called as part of the
+          // DB Get API, hence the merge operands should be merged.
+ if (do_merge_) {
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key_, nullptr,
+ merge_context_->GetOperands(), pinnable_val_->GetSelf(),
+ logger_, statistics_, env_);
+ pinnable_val_->PinSelf();
+ if (!merge_status.ok()) {
+ state_ = kCorrupt;
+ }
+ }
+ }
+ return false;
+ }
+ return true;
+
+ default:
+ assert(false);
+ break;
+ }
+ }
+
+  // state_ could be kCorrupt, kMerge or kNotFound
+ return false;
+}
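A simplified sketch of how a table reader typically drives SaveValue: loop over the internal keys at or after the lookup key and stop once SaveValue reports that no more entries are needed (the iterator is assumed from the caller, and the boolean-returning ParseInternalKey signature is assumed):

    bool matched = false;
    while (iter->Valid()) {
      ParsedInternalKey parsed;
      if (!ParseInternalKey(iter->key(), &parsed)) {
        break;  // corrupt internal key
      }
      if (!get_context->SaveValue(parsed, iter->value(), &matched)) {
        break;  // terminal state reached (kFound, kDeleted, kCorrupt, ...)
      }
      iter->Next();  // kMerge: keep collecting operands
    }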
+
+void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
+ if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
+ value_pinner != nullptr) {
+ value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
+ merge_context_->PushOperand(value, true /*value_pinned*/);
+ } else {
+ merge_context_->PushOperand(value, false);
+ }
+}
+
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+ GetContext* get_context, Cleanable* value_pinner) {
+#ifndef ROCKSDB_LITE
+ Slice s = replay_log;
+ while (s.size()) {
+ auto type = static_cast<ValueType>(*s.data());
+ s.remove_prefix(1);
+ Slice value;
+ bool ret = GetLengthPrefixedSlice(&s, &value);
+ assert(ret);
+ (void)ret;
+
+ bool dont_care __attribute__((__unused__));
+ // Since SequenceNumber is not stored and unknown, we will use
+ // kMaxSequenceNumber.
+ get_context->SaveValue(
+ ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
+ &dont_care, value_pinner);
+ }
+#else // ROCKSDB_LITE
+ (void)replay_log;
+ (void)user_key;
+ (void)get_context;
+ (void)value_pinner;
+ assert(false);
+#endif // ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h
new file mode 100644
index 000000000..79bae2214
--- /dev/null
+++ b/src/rocksdb/table/get_context.h
@@ -0,0 +1,191 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/read_callback.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/types.h"
+#include "table/block_based/block.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MergeContext;
+class PinnedIteratorsManager;
+
+// Data structure for accumulating statistics during a point lookup. At the
+// end of the point lookup, the corresponding ticker stats are updated. This
+// avoids the overhead of frequent ticker stats updates
+struct GetContextStats {
+ uint64_t num_cache_hit = 0;
+ uint64_t num_cache_index_hit = 0;
+ uint64_t num_cache_data_hit = 0;
+ uint64_t num_cache_filter_hit = 0;
+ uint64_t num_cache_compression_dict_hit = 0;
+ uint64_t num_cache_index_miss = 0;
+ uint64_t num_cache_filter_miss = 0;
+ uint64_t num_cache_data_miss = 0;
+ uint64_t num_cache_compression_dict_miss = 0;
+ uint64_t num_cache_bytes_read = 0;
+ uint64_t num_cache_miss = 0;
+ uint64_t num_cache_add = 0;
+ uint64_t num_cache_bytes_write = 0;
+ uint64_t num_cache_index_add = 0;
+ uint64_t num_cache_index_bytes_insert = 0;
+ uint64_t num_cache_data_add = 0;
+ uint64_t num_cache_data_bytes_insert = 0;
+ uint64_t num_cache_filter_add = 0;
+ uint64_t num_cache_filter_bytes_insert = 0;
+ uint64_t num_cache_compression_dict_add = 0;
+ uint64_t num_cache_compression_dict_bytes_insert = 0;
+};
+
+// A class to hold context about a point lookup, such as the pointer to the
+// value slice, the key, the merge context, etc., as well as the current
+// state of the lookup. Any caller using GetContext to track the lookup
+// result must call SaveValue() whenever a matching internal key is found.
+// This can happen repeatedly in case of merge operands. If the key may
+// exist with high probability, but IO is required to confirm it and the
+// user doesn't allow it, MarkKeyMayExist() must be called instead of
+// SaveValue().
+class GetContext {
+ public:
+ // Current state of the point lookup. All except kNotFound and kMerge are
+ // terminal states
+ enum GetState {
+ kNotFound,
+ kFound,
+ kDeleted,
+ kCorrupt,
+ kMerge, // saver contains the current merge result (the operands)
+ kBlobIndex,
+ };
+ GetContextStats get_context_stats_;
+
+ // Constructor
+ // @param value Holds the value corresponding to user_key. If its nullptr
+ // then return all merge operands corresponding to user_key
+ // via merge_context
+ // @param value_found If non-nullptr, set to false if key may be present
+ // but we can't be certain because we cannot do IO
+ // @param max_covering_tombstone_seq Pointer to highest sequence number of
+ // range deletion covering the key. When an internal key
+ // is found with smaller sequence number, the lookup
+ // terminates
+ // @param seq If non-nullptr, the sequence number of the found key will be
+ // saved here
+ // @param callback Pointer to ReadCallback to perform additional checks
+ // for visibility of a key
+ // @param is_blob_index If non-nullptr, will be used to indicate if a found
+ // key is of type blob index
+  // @param do_merge True if the value associated with user_key has to be
+  // returned, and false if all the merge operands associated with user_key
+  // have to be returned. If do_merge=false then all the merge operands are
+  // stored in merge_context and they are never merged. The value pointer is
+  // untouched.
+ GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+ Logger* logger, Statistics* statistics, GetState init_state,
+ const Slice& user_key, PinnableSlice* value, bool* value_found,
+ MergeContext* merge_context, bool do_merge,
+ SequenceNumber* max_covering_tombstone_seq, Env* env,
+ SequenceNumber* seq = nullptr,
+ PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ uint64_t tracing_get_id = 0);
+
+ GetContext() = delete;
+
+ // This can be called to indicate that a key may be present, but cannot be
+ // confirmed due to IO not allowed
+ void MarkKeyMayExist();
+
+ // Records this key, value, and any meta-data (such as sequence number and
+ // state) into this GetContext.
+ //
+ // If the parsed_key matches the user key that we are looking for, sets
+ // matched to true.
+ //
+ // Returns True if more keys need to be read (due to merges) or
+ // False if the complete value has been found.
+ bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value,
+ bool* matched, Cleanable* value_pinner = nullptr);
+
+ // Simplified version of the previous function. Should only be used when we
+ // know that the operation is a Put.
+ void SaveValue(const Slice& value, SequenceNumber seq);
+
+ GetState State() const { return state_; }
+
+ SequenceNumber* max_covering_tombstone_seq() {
+ return max_covering_tombstone_seq_;
+ }
+
+ PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; }
+
+ // If a non-null string is passed, all the SaveValue calls will be
+ // logged into the string. The operations can then be replayed on
+ // another GetContext with replayGetContextLog.
+ void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; }
+
+ // Do we need to fetch the SequenceNumber for this key?
+ bool NeedToReadSequence() const { return (seq_ != nullptr); }
+
+ bool sample() const { return sample_; }
+
+ bool CheckCallback(SequenceNumber seq) {
+ if (callback_) {
+ return callback_->IsVisible(seq);
+ }
+ return true;
+ }
+
+ void ReportCounters();
+
+ bool has_callback() const { return callback_ != nullptr; }
+
+ uint64_t get_tracing_get_id() const { return tracing_get_id_; }
+
+ void push_operand(const Slice& value, Cleanable* value_pinner);
+
+ private:
+ const Comparator* ucmp_;
+ const MergeOperator* merge_operator_;
+ // the merge operations encountered;
+ Logger* logger_;
+ Statistics* statistics_;
+
+ GetState state_;
+ Slice user_key_;
+ PinnableSlice* pinnable_val_;
+ bool* value_found_; // Is value set correctly? Used by KeyMayExist
+ MergeContext* merge_context_;
+ SequenceNumber* max_covering_tombstone_seq_;
+ Env* env_;
+ // If a key is found, seq_ will be set to the SequenceNumber of most recent
+ // write to the key or kMaxSequenceNumber if unknown
+ SequenceNumber* seq_;
+ std::string* replay_log_;
+ // Used to temporarily pin blocks when state_ == GetContext::kMerge
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ ReadCallback* callback_;
+ bool sample_;
+ // Value is true if it's called as part of DB Get API and false if it's
+ // called as part of DB GetMergeOperands API. When it's false merge operators
+ // are never merged.
+ bool do_merge_;
+ bool* is_blob_index_;
+ // Used for block cache tracing only. A tracing get id uniquely identifies a
+ // Get or a MultiGet.
+ const uint64_t tracing_get_id_;
+};
+
+// Call this to replay a log and bring the get_context up to date. The replay
+// log must have been created by another GetContext object, whose replay log
+// must have been set by calling GetContext::SetReplayLog().
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+ GetContext* get_context,
+ Cleanable* value_pinner = nullptr);
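A hedged end-to-end sketch of the replay-log round trip (the contexts and arguments are assumed to be set up by the caller):

    std::string replay_log;
    get_context->SetReplayLog(&replay_log);
    // ... perform the lookup; each SaveValue() call is appended to the log ...
    get_context->SetReplayLog(nullptr);

    // Later, replay the recorded operations into a fresh context:
    replayGetContextLog(Slice(replay_log), user_key, &other_context);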
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/internal_iterator.h b/src/rocksdb/table/internal_iterator.h
new file mode 100644
index 000000000..780db64b3
--- /dev/null
+++ b/src/rocksdb/table/internal_iterator.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <string>
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/status.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PinnedIteratorsManager;
+
+struct IterateResult {
+ Slice key;
+ bool may_be_out_of_upper_bound;
+};
+
+template <class TValue>
+class InternalIteratorBase : public Cleanable {
+ public:
+ InternalIteratorBase() {}
+
+ // No copying allowed
+ InternalIteratorBase(const InternalIteratorBase&) = delete;
+ InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
+
+ virtual ~InternalIteratorBase() {}
+
+ // An iterator is either positioned at a key/value pair, or
+ // not valid. This method returns true iff the iterator is valid.
+ // Always returns false if !status().ok().
+ virtual bool Valid() const = 0;
+
+ // Position at the first key in the source. The iterator is Valid()
+ // after this call iff the source is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last key in the source. The iterator is
+ // Valid() after this call iff the source is not empty.
+ virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that is at or past target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ // All Seek*() methods clear any error status() that the iterator had prior to
+ // the call; after the seek, status() indicates only the error (if any) that
+ // happened during the seek, not any past errors.
+ virtual void Seek(const Slice& target) = 0;
+
+  // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const Slice& target) = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+  // Moves to the next entry in the source, and returns the result. Iterator
+  // implementations should override this method to help methods inline
+  // better, or when MayBeOutOfUpperBound() is non-trivial.
+ // REQUIRES: Valid()
+ virtual bool NextAndGetResult(IterateResult* result) {
+ Next();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual
+ // call. If an implementation has non-trivial MayBeOutOfUpperBound(),
+ // it should also override NextAndGetResult().
+ result->may_be_out_of_upper_bound = true;
+ assert(MayBeOutOfUpperBound());
+ }
+ return is_valid;
+ }
+
+ // Moves to the previous entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the first entry in source.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return user key for the current entry.
+ // REQUIRES: Valid()
+ virtual Slice user_key() const { return ExtractUserKey(key()); }
+
+ // Return the value for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual TValue value() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ // If non-blocking IO is requested and this operation cannot be
+ // satisfied without doing some IO, then this returns Status::Incomplete().
+ virtual Status status() const = 0;
+
+ // True if the iterator is invalidated because it reached a key that is above
+ // the iterator upper bound. Used by LevelIterator to decide whether it should
+ // stop or move on to the next file.
+ // Important: if iterator reached the end of the file without encountering any
+ // keys above the upper bound, IsOutOfBound() must return false.
+ virtual bool IsOutOfBound() { return false; }
+
+  // Keys returned from this iterator can be smaller than
+  // iterate_lower_bound.
+ virtual bool MayBeOutOfLowerBound() { return true; }
+
+  // Keys returned from this iterator can be larger than or equal to
+  // iterate_upper_bound.
+ virtual bool MayBeOutOfUpperBound() { return true; }
+
+  // Pass the PinnedIteratorsManager to the Iterator. Most Iterators don't
+  // communicate with PinnedIteratorsManager, so the default implementation
+  // is a no-op. Iterators that need to communicate with
+  // PinnedIteratorsManager should override this function and use the passed
+  // pointer to do so.
+ virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) {
+ }
+
+ // If true, this means that the Slice returned by key() is valid as long as
+ // PinnedIteratorsManager::ReleasePinnedData is not called and the
+ // Iterator is not deleted.
+ //
+ // IsKeyPinned() is guaranteed to always return true if
+ // - Iterator is created with ReadOptions::pin_data = true
+ // - DB tables were created with BlockBasedTableOptions::use_delta_encoding
+ // set to false.
+ virtual bool IsKeyPinned() const { return false; }
+
+ // If true, this means that the Slice returned by value() is valid as long as
+ // PinnedIteratorsManager::ReleasePinnedData is not called and the
+ // Iterator is not deleted.
+ virtual bool IsValuePinned() const { return false; }
+
+ virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) {
+ return Status::NotSupported("");
+ }
+
+ protected:
+ void SeekForPrevImpl(const Slice& target, const Comparator* cmp) {
+ Seek(target);
+ if (!Valid()) {
+ SeekToLast();
+ }
+ while (Valid() && cmp->Compare(target, key()) < 0) {
+ Prev();
+ }
+ }
+
+ bool is_mutable_;
+};
+
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an empty iterator (yields nothing).
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator();
+
+// Return an empty iterator with the specified status.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+ const Status& status);
+
+// Return an empty iterator with the specified status, allocated arena.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+ const Status& status, Arena* arena);
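A hedged usage sketch of the arena overloads (Arena comes from memory/arena.h; arena-allocated iterators are placement-new'd, so they must be destroyed in place rather than deleted, mirroring IteratorWrapperBase::DeleteIter):

    Arena arena;
    InternalIterator* it = NewErrorInternalIterator<Slice>(
        Status::Corruption("bad block"), &arena);
    // ... use it; it->status() returns the error ...
    it->~InternalIteratorBase<Slice>();  // never `delete` an arena iterator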
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iter_heap.h b/src/rocksdb/table/iter_heap.h
new file mode 100644
index 000000000..f6812fa03
--- /dev/null
+++ b/src/rocksdb/table/iter_heap.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the max/largest key on top.
+class MaxIteratorComparator {
+ public:
+ MaxIteratorComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+ return comparator_->Compare(a->key(), b->key()) < 0;
+ }
+ private:
+ const InternalKeyComparator* comparator_;
+};
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the min/smallest key on top.
+class MinIteratorComparator {
+ public:
+ MinIteratorComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+ return comparator_->Compare(a->key(), b->key()) > 0;
+ }
+ private:
+ const InternalKeyComparator* comparator_;
+};
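A minimal sketch of how these functors pair with std::priority_queue (the InternalKeyComparator `icomp` and the wrapped child iterators are assumed from the caller); the same comparators also parameterize the BinaryHeap used by MergingIterator:

    #include <queue>
    #include <vector>

    std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
                        MinIteratorComparator>
        min_heap{MinIteratorComparator(&icomp)};
    min_heap.push(&child_a);  // after Seek()/SeekToFirst() on each child
    min_heap.push(&child_b);
    IteratorWrapper* smallest = min_heap.top();  // child with smallest key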
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iterator.cc b/src/rocksdb/table/iterator.cc
new file mode 100644
index 000000000..4ecfc007b
--- /dev/null
+++ b/src/rocksdb/table/iterator.cc
@@ -0,0 +1,210 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+#include "memory/arena.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Cleanable::Cleanable() {
+ cleanup_.function = nullptr;
+ cleanup_.next = nullptr;
+}
+
+Cleanable::~Cleanable() { DoCleanup(); }
+
+Cleanable::Cleanable(Cleanable&& other) {
+ *this = std::move(other);
+}
+
+Cleanable& Cleanable::operator=(Cleanable&& other) {
+ if (this != &other) {
+ cleanup_ = other.cleanup_;
+ other.cleanup_.function = nullptr;
+ other.cleanup_.next = nullptr;
+ }
+ return *this;
+}
+
+// If the entire linked list were on the heap, we could simply attach one
+// linked list to another. However, the head is an embedded object, to avoid
+// the cost of creating objects in the most common case, when the Cleanable
+// has only one Cleanup to do. We could put everything on the heap if
+// benchmarks show no negative impact on performance.
+// Also, we need to iterate over the linked list since there is no pointer to
+// the tail. We could add a tail pointer, but maintaining it might negatively
+// impact performance for the common case of one cleanup, where the tail
+// pointer is not needed. Again, benchmarks could clarify that.
+// Even without a tail pointer we could iterate over the list, find the tail,
+// and have only that node updated, without the need to insert the Cleanups
+// one by one. This, however, would be redundant when the source Cleanable
+// has one or only a few Cleanups, which is the case most of the time.
+// TODO(myabandeh): if the list is too long we should maintain a tail pointer
+// and have the entire list (minus the head that has to be inserted
+// separately) merged with the target linked list at once.
+void Cleanable::DelegateCleanupsTo(Cleanable* other) {
+ assert(other != nullptr);
+ if (cleanup_.function == nullptr) {
+ return;
+ }
+ Cleanup* c = &cleanup_;
+ other->RegisterCleanup(c->function, c->arg1, c->arg2);
+ c = c->next;
+ while (c != nullptr) {
+ Cleanup* next = c->next;
+ other->RegisterCleanup(c);
+ c = next;
+ }
+ cleanup_.function = nullptr;
+ cleanup_.next = nullptr;
+}
+
+void Cleanable::RegisterCleanup(Cleanable::Cleanup* c) {
+ assert(c != nullptr);
+ if (cleanup_.function == nullptr) {
+ cleanup_.function = c->function;
+ cleanup_.arg1 = c->arg1;
+ cleanup_.arg2 = c->arg2;
+ delete c;
+ } else {
+ c->next = cleanup_.next;
+ cleanup_.next = c;
+ }
+}
+
+void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+ assert(func != nullptr);
+ Cleanup* c;
+ if (cleanup_.function == nullptr) {
+ c = &cleanup_;
+ } else {
+ c = new Cleanup;
+ c->next = cleanup_.next;
+ cleanup_.next = c;
+ }
+ c->function = func;
+ c->arg1 = arg1;
+ c->arg2 = arg2;
+}
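A small usage sketch: registering a function that frees a heap buffer when the Cleanable is destroyed (names hypothetical):

    static void DeleteCharArray(void* arg1, void* /*arg2*/) {
      delete[] static_cast<char*>(arg1);
    }

    char* buf = new char[1024];
    Cleanable holder;
    holder.RegisterCleanup(&DeleteCharArray, buf, nullptr);
    // delete[] runs in ~Cleanable(), or later in whichever Cleanable the
    // cleanups are delegated to via DelegateCleanupsTo().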
+
+Status Iterator::GetProperty(std::string prop_name, std::string* prop) {
+ if (prop == nullptr) {
+ return Status::InvalidArgument("prop is nullptr");
+ }
+ if (prop_name == "rocksdb.iterator.is-key-pinned") {
+ *prop = "0";
+ return Status::OK();
+ }
+ return Status::InvalidArgument("Unidentified property.");
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+ explicit EmptyIterator(const Status& s) : status_(s) { }
+ bool Valid() const override { return false; }
+ void Seek(const Slice& /*target*/) override {}
+ void SeekForPrev(const Slice& /*target*/) override {}
+ void SeekToFirst() override {}
+ void SeekToLast() override {}
+ void Next() override { assert(false); }
+ void Prev() override { assert(false); }
+ Slice key() const override {
+ assert(false);
+ return Slice();
+ }
+ Slice value() const override {
+ assert(false);
+ return Slice();
+ }
+ Status status() const override { return status_; }
+
+ private:
+ Status status_;
+};
+
+template <class TValue = Slice>
+class EmptyInternalIterator : public InternalIteratorBase<TValue> {
+ public:
+ explicit EmptyInternalIterator(const Status& s) : status_(s) {}
+ bool Valid() const override { return false; }
+ void Seek(const Slice& /*target*/) override {}
+ void SeekForPrev(const Slice& /*target*/) override {}
+ void SeekToFirst() override {}
+ void SeekToLast() override {}
+ void Next() override { assert(false); }
+ void Prev() override { assert(false); }
+ Slice key() const override {
+ assert(false);
+ return Slice();
+ }
+ TValue value() const override {
+ assert(false);
+ return TValue();
+ }
+ Status status() const override { return status_; }
+
+ private:
+ Status status_;
+};
+} // namespace
+
+Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); }
+
+Iterator* NewErrorIterator(const Status& status) {
+ return new EmptyIterator(status);
+}
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) {
+ return new EmptyInternalIterator<TValue>(status);
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+ const Status& status);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+ const Status& status);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status,
+ Arena* arena) {
+ if (arena == nullptr) {
+ return NewErrorInternalIterator<TValue>(status);
+ } else {
+ auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+ return new (mem) EmptyInternalIterator<TValue>(status);
+ }
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+ const Status& status, Arena* arena);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+ const Status& status, Arena* arena);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator() {
+ return new EmptyInternalIterator<TValue>(Status::OK());
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator();
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator();
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) {
+ if (arena == nullptr) {
+ return NewEmptyInternalIterator<TValue>();
+ } else {
+ auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+ return new (mem) EmptyInternalIterator<TValue>(Status::OK());
+ }
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator(
+ Arena* arena);
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iterator_wrapper.h b/src/rocksdb/table/iterator_wrapper.h
new file mode 100644
index 000000000..c13359e99
--- /dev/null
+++ b/src/rocksdb/table/iterator_wrapper.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <set>
+
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal wrapper class with an interface similar to Iterator that
+// the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+template <class TValue = Slice>
+class IteratorWrapperBase {
+ public:
+ IteratorWrapperBase() : iter_(nullptr), valid_(false) {}
+ explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter)
+ : iter_(nullptr) {
+ Set(_iter);
+ }
+ ~IteratorWrapperBase() {}
+ InternalIteratorBase<TValue>* iter() const { return iter_; }
+
+ // Set the underlying Iterator to _iter and return
+ // previous underlying Iterator.
+ InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) {
+ InternalIteratorBase<TValue>* old_iter = iter_;
+
+ iter_ = _iter;
+ if (iter_ == nullptr) {
+ valid_ = false;
+ } else {
+ Update();
+ }
+ return old_iter;
+ }
+
+ void DeleteIter(bool is_arena_mode) {
+ if (iter_) {
+ if (!is_arena_mode) {
+ delete iter_;
+ } else {
+ iter_->~InternalIteratorBase<TValue>();
+ }
+ }
+ }
+
+ // Iterator interface methods
+ bool Valid() const { return valid_; }
+ Slice key() const {
+ assert(Valid());
+ return result_.key;
+ }
+ TValue value() const {
+ assert(Valid());
+ return iter_->value();
+ }
+ // Methods below require iter() != nullptr
+ Status status() const {
+ assert(iter_);
+ return iter_->status();
+ }
+ void Next() {
+ assert(iter_);
+ valid_ = iter_->NextAndGetResult(&result_);
+ assert(!valid_ || iter_->status().ok());
+ }
+ void Prev() {
+ assert(iter_);
+ iter_->Prev();
+ Update();
+ }
+ void Seek(const Slice& k) {
+ assert(iter_);
+ iter_->Seek(k);
+ Update();
+ }
+ void SeekForPrev(const Slice& k) {
+ assert(iter_);
+ iter_->SeekForPrev(k);
+ Update();
+ }
+ void SeekToFirst() {
+ assert(iter_);
+ iter_->SeekToFirst();
+ Update();
+ }
+ void SeekToLast() {
+ assert(iter_);
+ iter_->SeekToLast();
+ Update();
+ }
+
+ bool MayBeOutOfLowerBound() {
+ assert(Valid());
+ return iter_->MayBeOutOfLowerBound();
+ }
+
+ bool MayBeOutOfUpperBound() {
+ assert(Valid());
+ return result_.may_be_out_of_upper_bound;
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {
+ assert(iter_);
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ bool IsKeyPinned() const {
+ assert(Valid());
+ return iter_->IsKeyPinned();
+ }
+ bool IsValuePinned() const {
+ assert(Valid());
+ return iter_->IsValuePinned();
+ }
+
+ private:
+ void Update() {
+ valid_ = iter_->Valid();
+ if (valid_) {
+ assert(iter_->status().ok());
+ result_.key = iter_->key();
+ result_.may_be_out_of_upper_bound = true;
+ }
+ }
+
+ InternalIteratorBase<TValue>* iter_;
+ IterateResult result_;
+ bool valid_;
+};
+
+using IteratorWrapper = IteratorWrapperBase<Slice>;
+
+class Arena;
+// Return an empty iterator (yields nothing) allocated from arena.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc
new file mode 100644
index 000000000..466e0eb42
--- /dev/null
+++ b/src/rocksdb/table/merger_test.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <vector>
+#include <string>
+
+#include "table/merging_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergerTest : public testing::Test {
+ public:
+ MergerTest()
+ : icomp_(BytewiseComparator()),
+ rnd_(3),
+ merging_iterator_(nullptr),
+ single_iterator_(nullptr) {}
+ ~MergerTest() override = default;
+ std::vector<std::string> GenerateStrings(size_t len, int string_len) {
+ std::vector<std::string> ret;
+
+ for (size_t i = 0; i < len; ++i) {
+ InternalKey ik(test::RandomHumanReadableString(&rnd_, string_len), 0,
+ ValueType::kTypeValue);
+ ret.push_back(ik.Encode().ToString(false));
+ }
+ return ret;
+ }
+
+ void AssertEquivalence() {
+ auto a = merging_iterator_.get();
+ auto b = single_iterator_.get();
+ if (!a->Valid()) {
+ ASSERT_TRUE(!b->Valid());
+ } else {
+ ASSERT_TRUE(b->Valid());
+ ASSERT_EQ(b->key().ToString(), a->key().ToString());
+ ASSERT_EQ(b->value().ToString(), a->value().ToString());
+ }
+ }
+
+ void SeekToRandom() {
+ InternalKey ik(test::RandomHumanReadableString(&rnd_, 5), 0,
+ ValueType::kTypeValue);
+ Seek(ik.Encode().ToString(false));
+ }
+
+ void Seek(std::string target) {
+ merging_iterator_->Seek(target);
+ single_iterator_->Seek(target);
+ }
+
+ void SeekToFirst() {
+ merging_iterator_->SeekToFirst();
+ single_iterator_->SeekToFirst();
+ }
+
+ void SeekToLast() {
+ merging_iterator_->SeekToLast();
+ single_iterator_->SeekToLast();
+ }
+
+ void Next(int times) {
+ for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+ AssertEquivalence();
+ merging_iterator_->Next();
+ single_iterator_->Next();
+ }
+ AssertEquivalence();
+ }
+
+ void Prev(int times) {
+ for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+ AssertEquivalence();
+ merging_iterator_->Prev();
+ single_iterator_->Prev();
+ }
+ AssertEquivalence();
+ }
+
+ void NextAndPrev(int times) {
+ for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+ AssertEquivalence();
+ if (rnd_.OneIn(2)) {
+ merging_iterator_->Prev();
+ single_iterator_->Prev();
+ } else {
+ merging_iterator_->Next();
+ single_iterator_->Next();
+ }
+ }
+ AssertEquivalence();
+ }
+
+ void Generate(size_t num_iterators, size_t strings_per_iterator,
+ int letters_per_string) {
+ std::vector<InternalIterator*> small_iterators;
+ for (size_t i = 0; i < num_iterators; ++i) {
+ auto strings = GenerateStrings(strings_per_iterator, letters_per_string);
+ small_iterators.push_back(new test::VectorIterator(strings));
+ all_keys_.insert(all_keys_.end(), strings.begin(), strings.end());
+ }
+
+ merging_iterator_.reset(
+ NewMergingIterator(&icomp_, &small_iterators[0],
+ static_cast<int>(small_iterators.size())));
+ single_iterator_.reset(new test::VectorIterator(all_keys_));
+ }
+
+ InternalKeyComparator icomp_;
+ Random rnd_;
+ std::unique_ptr<InternalIterator> merging_iterator_;
+ std::unique_ptr<InternalIterator> single_iterator_;
+ std::vector<std::string> all_keys_;
+};
+
+TEST_F(MergerTest, SeekToRandomNextTest) {
+ Generate(1000, 50, 50);
+ for (int i = 0; i < 10; ++i) {
+ SeekToRandom();
+ AssertEquivalence();
+ Next(50000);
+ }
+}
+
+TEST_F(MergerTest, SeekToRandomNextSmallStringsTest) {
+ Generate(1000, 50, 2);
+ for (int i = 0; i < 10; ++i) {
+ SeekToRandom();
+ AssertEquivalence();
+ Next(50000);
+ }
+}
+
+TEST_F(MergerTest, SeekToRandomPrevTest) {
+ Generate(1000, 50, 50);
+ for (int i = 0; i < 10; ++i) {
+ SeekToRandom();
+ AssertEquivalence();
+ Prev(50000);
+ }
+}
+
+TEST_F(MergerTest, SeekToRandomRandomTest) {
+ Generate(200, 50, 50);
+ for (int i = 0; i < 3; ++i) {
+ SeekToRandom();
+ AssertEquivalence();
+ NextAndPrev(5000);
+ }
+}
+
+TEST_F(MergerTest, SeekToFirstTest) {
+ Generate(1000, 50, 50);
+ for (int i = 0; i < 10; ++i) {
+ SeekToFirst();
+ AssertEquivalence();
+ Next(50000);
+ }
+}
+
+TEST_F(MergerTest, SeekToLastTest) {
+ Generate(1000, 50, 50);
+ for (int i = 0; i < 10; ++i) {
+ SeekToLast();
+ AssertEquivalence();
+ Prev(50000);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/merging_iterator.cc b/src/rocksdb/table/merging_iterator.cc
new file mode 100644
index 000000000..47fa048f3
--- /dev/null
+++ b/src/rocksdb/table/merging_iterator.cc
@@ -0,0 +1,468 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merging_iterator.h"
+#include <string>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iter_heap.h"
+#include "table/iterator_wrapper.h"
+#include "test_util/sync_point.h"
+#include "util/autovector.h"
+#include "util/heap.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Without the anonymous namespace here, we would trigger -Wmissing-prototypes
+// warnings.
+namespace {
+typedef BinaryHeap<IteratorWrapper*, MaxIteratorComparator> MergerMaxIterHeap;
+typedef BinaryHeap<IteratorWrapper*, MinIteratorComparator> MergerMinIterHeap;
+} // namespace
+
+const size_t kNumIterReserve = 4;
+
+class MergingIterator : public InternalIterator {
+ public:
+ MergingIterator(const InternalKeyComparator* comparator,
+ InternalIterator** children, int n, bool is_arena_mode,
+ bool prefix_seek_mode)
+ : is_arena_mode_(is_arena_mode),
+ comparator_(comparator),
+ current_(nullptr),
+ direction_(kForward),
+ minHeap_(comparator_),
+ prefix_seek_mode_(prefix_seek_mode),
+ pinned_iters_mgr_(nullptr) {
+ children_.resize(n);
+ for (int i = 0; i < n; i++) {
+ children_[i].Set(children[i]);
+ }
+ for (auto& child : children_) {
+ AddToMinHeapOrCheckStatus(&child);
+ }
+ current_ = CurrentForward();
+ }
+
+ void considerStatus(Status s) {
+ if (!s.ok() && status_.ok()) {
+ status_ = s;
+ }
+ }
+
+ virtual void AddIterator(InternalIterator* iter) {
+ assert(direction_ == kForward);
+ children_.emplace_back(iter);
+ if (pinned_iters_mgr_) {
+ iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ auto new_wrapper = children_.back();
+ AddToMinHeapOrCheckStatus(&new_wrapper);
+ if (new_wrapper.Valid()) {
+ current_ = CurrentForward();
+ }
+ }
+
+ ~MergingIterator() override {
+ for (auto& child : children_) {
+ child.DeleteIter(is_arena_mode_);
+ }
+ }
+
+ bool Valid() const override { return current_ != nullptr && status_.ok(); }
+
+ Status status() const override { return status_; }
+
+ void SeekToFirst() override {
+ ClearHeaps();
+ status_ = Status::OK();
+ for (auto& child : children_) {
+ child.SeekToFirst();
+ AddToMinHeapOrCheckStatus(&child);
+ }
+ direction_ = kForward;
+ current_ = CurrentForward();
+ }
+
+ void SeekToLast() override {
+ ClearHeaps();
+ InitMaxHeap();
+ status_ = Status::OK();
+ for (auto& child : children_) {
+ child.SeekToLast();
+ AddToMaxHeapOrCheckStatus(&child);
+ }
+ direction_ = kReverse;
+ current_ = CurrentReverse();
+ }
+
+ void Seek(const Slice& target) override {
+ ClearHeaps();
+ status_ = Status::OK();
+ for (auto& child : children_) {
+ {
+ PERF_TIMER_GUARD(seek_child_seek_time);
+ child.Seek(target);
+ }
+
+ PERF_COUNTER_ADD(seek_child_seek_count, 1);
+ {
+        // Strictly speaking, we time slightly more than the min heap
+        // operation, but these operations are very cheap.
+ PERF_TIMER_GUARD(seek_min_heap_time);
+ AddToMinHeapOrCheckStatus(&child);
+ }
+ }
+ direction_ = kForward;
+ {
+ PERF_TIMER_GUARD(seek_min_heap_time);
+ current_ = CurrentForward();
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ ClearHeaps();
+ InitMaxHeap();
+ status_ = Status::OK();
+
+ for (auto& child : children_) {
+ {
+ PERF_TIMER_GUARD(seek_child_seek_time);
+ child.SeekForPrev(target);
+ }
+ PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+ {
+ PERF_TIMER_GUARD(seek_max_heap_time);
+ AddToMaxHeapOrCheckStatus(&child);
+ }
+ }
+ direction_ = kReverse;
+ {
+ PERF_TIMER_GUARD(seek_max_heap_time);
+ current_ = CurrentReverse();
+ }
+ }
+
+ void Next() override {
+ assert(Valid());
+
+ // Ensure that all children are positioned after key().
+ // If we are moving in the forward direction, it is already
+ // true for all of the non-current children since current_ is
+ // the smallest child and key() == current_->key().
+ if (direction_ != kForward) {
+ SwitchToForward();
+ // The loop advanced all non-current children to be > key() so current_
+ // should still be strictly the smallest key.
+ }
+
+ // For the heap modifications below to be correct, current_ must be the
+ // current top of the heap.
+ assert(current_ == CurrentForward());
+
+    // current_ points to the current record; move the iterator forward.
+ current_->Next();
+ if (current_->Valid()) {
+ // current is still valid after the Next() call above. Call
+ // replace_top() to restore the heap property. When the same child
+ // iterator yields a sequence of keys, this is cheap.
+ assert(current_->status().ok());
+ minHeap_.replace_top(current_);
+ } else {
+ // current stopped being valid, remove it from the heap.
+ considerStatus(current_->status());
+ minHeap_.pop();
+ }
+ current_ = CurrentForward();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ Next();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
+ }
+ return is_valid;
+ }
+
+ void Prev() override {
+ assert(Valid());
+ // Ensure that all children are positioned before key().
+ // If we are moving in the reverse direction, it is already
+ // true for all of the non-current children since current_ is
+ // the largest child and key() == current_->key().
+ if (direction_ != kReverse) {
+ // Otherwise, retreat the non-current children. We retreat current_
+ // just after the if-block.
+ SwitchToBackward();
+ }
+
+ // For the heap modifications below to be correct, current_ must be the
+ // current top of the heap.
+ assert(current_ == CurrentReverse());
+
+ current_->Prev();
+ if (current_->Valid()) {
+ // current is still valid after the Prev() call above. Call
+ // replace_top() to restore the heap property. When the same child
+ // iterator yields a sequence of keys, this is cheap.
+ assert(current_->status().ok());
+ maxHeap_->replace_top(current_);
+ } else {
+ // current stopped being valid, remove it from the heap.
+ considerStatus(current_->status());
+ maxHeap_->pop();
+ }
+ current_ = CurrentReverse();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return current_->key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return current_->value();
+ }
+
+  // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result
+  // from the current child iterator. As long as the current child iterator
+  // reports that going out of bound is not possible, we know the current key
+  // is within bounds.
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return current_->MayBeOutOfLowerBound();
+ }
+
+ bool MayBeOutOfUpperBound() override {
+ assert(Valid());
+ return current_->MayBeOutOfUpperBound();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ for (auto& child : children_) {
+ child.SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ }
+
+ bool IsKeyPinned() const override {
+ assert(Valid());
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(Valid());
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsValuePinned();
+ }
+
+ private:
+ // Clears heaps for both directions, used when changing direction or seeking
+ void ClearHeaps();
+ // Ensures that maxHeap_ is initialized when starting to go in the reverse
+ // direction
+ void InitMaxHeap();
+
+ bool is_arena_mode_;
+ const InternalKeyComparator* comparator_;
+ autovector<IteratorWrapper, kNumIterReserve> children_;
+
+ // Cached pointer to child iterator with the current key, or nullptr if no
+ // child iterators are valid. This is the top of minHeap_ or maxHeap_
+ // depending on the direction.
+ IteratorWrapper* current_;
+ // If any of the children have non-ok status, this is one of them.
+ Status status_;
+ // Which direction is the iterator moving?
+ enum Direction {
+ kForward,
+ kReverse
+ };
+ Direction direction_;
+ MergerMinIterHeap minHeap_;
+ bool prefix_seek_mode_;
+
+ // Max heap is used for reverse iteration, which is way less common than
+ // forward. Lazily initialize it to save memory.
+ std::unique_ptr<MergerMaxIterHeap> maxHeap_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+
+ // In forward direction, process a child that is not in the min heap.
+ // If valid, add to the min heap. Otherwise, check status.
+ void AddToMinHeapOrCheckStatus(IteratorWrapper*);
+
+  // In backward direction, process a child that is not in the max heap.
+  // If valid, add to the max heap. Otherwise, check status.
+ void AddToMaxHeapOrCheckStatus(IteratorWrapper*);
+
+ void SwitchToForward();
+
+ // Switch the direction from forward to backward without changing the
+ // position. Iterator should still be valid.
+ void SwitchToBackward();
+
+ IteratorWrapper* CurrentForward() const {
+ assert(direction_ == kForward);
+ return !minHeap_.empty() ? minHeap_.top() : nullptr;
+ }
+
+ IteratorWrapper* CurrentReverse() const {
+ assert(direction_ == kReverse);
+ assert(maxHeap_);
+ return !maxHeap_->empty() ? maxHeap_->top() : nullptr;
+ }
+};
+
+void MergingIterator::AddToMinHeapOrCheckStatus(IteratorWrapper* child) {
+ if (child->Valid()) {
+ assert(child->status().ok());
+ minHeap_.push(child);
+ } else {
+ considerStatus(child->status());
+ }
+}
+
+void MergingIterator::AddToMaxHeapOrCheckStatus(IteratorWrapper* child) {
+ if (child->Valid()) {
+ assert(child->status().ok());
+ maxHeap_->push(child);
+ } else {
+ considerStatus(child->status());
+ }
+}
+
+void MergingIterator::SwitchToForward() {
+  // Advance all non-current children to be past key(); current_ itself is
+  // advanced by the caller right after this returns.
+ ClearHeaps();
+ Slice target = key();
+ for (auto& child : children_) {
+ if (&child != current_) {
+ child.Seek(target);
+ if (child.Valid() && comparator_->Equal(target, child.key())) {
+ assert(child.status().ok());
+ child.Next();
+ }
+ }
+ AddToMinHeapOrCheckStatus(&child);
+ }
+ direction_ = kForward;
+}
+
+void MergingIterator::SwitchToBackward() {
+ ClearHeaps();
+ InitMaxHeap();
+ Slice target = key();
+ for (auto& child : children_) {
+ if (&child != current_) {
+ child.SeekForPrev(target);
+ TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
+ if (child.Valid() && comparator_->Equal(target, child.key())) {
+ assert(child.status().ok());
+ child.Prev();
+ }
+ }
+ AddToMaxHeapOrCheckStatus(&child);
+ }
+ direction_ = kReverse;
+ if (!prefix_seek_mode_) {
+ // Note that we don't do assert(current_ == CurrentReverse()) here
+ // because it is possible to have some keys larger than the seek-key
+ // inserted between Seek() and SeekToLast(), which makes current_ not
+ // equal to CurrentReverse().
+ current_ = CurrentReverse();
+ }
+ assert(current_ == CurrentReverse());
+}
+
+void MergingIterator::ClearHeaps() {
+ minHeap_.clear();
+ if (maxHeap_) {
+ maxHeap_->clear();
+ }
+}
+
+void MergingIterator::InitMaxHeap() {
+ if (!maxHeap_) {
+ maxHeap_.reset(new MergerMaxIterHeap(comparator_));
+ }
+}
+
+InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
+ InternalIterator** list, int n,
+ Arena* arena, bool prefix_seek_mode) {
+ assert(n >= 0);
+ if (n == 0) {
+ return NewEmptyInternalIterator<Slice>(arena);
+ } else if (n == 1) {
+ return list[0];
+ } else {
+ if (arena == nullptr) {
+ return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
+ } else {
+ auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+ return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
+ }
+ }
+}
+
+MergeIteratorBuilder::MergeIteratorBuilder(
+ const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode)
+ : first_iter(nullptr), use_merging_iter(false), arena(a) {
+ auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+ merge_iter =
+ new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
+}
+
+MergeIteratorBuilder::~MergeIteratorBuilder() {
+ if (first_iter != nullptr) {
+ first_iter->~InternalIterator();
+ }
+ if (merge_iter != nullptr) {
+ merge_iter->~MergingIterator();
+ }
+}
+
+void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
+ if (!use_merging_iter && first_iter != nullptr) {
+ merge_iter->AddIterator(first_iter);
+ use_merging_iter = true;
+ first_iter = nullptr;
+ }
+ if (use_merging_iter) {
+ merge_iter->AddIterator(iter);
+ } else {
+ first_iter = iter;
+ }
+}
+
+InternalIterator* MergeIteratorBuilder::Finish() {
+ InternalIterator* ret = nullptr;
+ if (!use_merging_iter) {
+ ret = first_iter;
+ first_iter = nullptr;
+ } else {
+ ret = merge_iter;
+ merge_iter = nullptr;
+ }
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/merging_iterator.h b/src/rocksdb/table/merging_iterator.h
new file mode 100644
index 000000000..c4429bf58
--- /dev/null
+++ b/src/rocksdb/table/merging_iterator.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Env;
+class Arena;
+template <class TValue>
+class InternalIteratorBase;
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1]. Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression. I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern InternalIterator* NewMergingIterator(
+ const InternalKeyComparator* comparator, InternalIterator** children, int n,
+ Arena* arena = nullptr, bool prefix_seek_mode = false);
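+
+// A minimal usage sketch (illustrative only; `icmp` is an
+// InternalKeyComparator and the two child iterators are assumed to already
+// exist):
+//
+//   InternalIterator* children[2] = {memtable_iter, sst_iter};
+//   InternalIterator* merged = NewMergingIterator(&icmp, children, 2);
+//   for (merged->SeekToFirst(); merged->Valid(); merged->Next()) {
+//     // Keys from both children appear in comparator order; a key present
+//     // in both is yielded twice (no duplicate suppression).
+//   }
+//   delete merged;  // non-arena mode: this also deletes the children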
+
+class MergingIterator;
+
+// A builder class to build a merging iterator by adding iterators one by one.
+class MergeIteratorBuilder {
+ public:
+  // comparator: the comparator used by the merging iterator
+ // arena: where the merging iterator needs to be allocated from.
+ explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
+ Arena* arena, bool prefix_seek_mode = false);
+ ~MergeIteratorBuilder();
+
+ // Add iter to the merging iterator.
+ void AddIterator(InternalIterator* iter);
+
+  // Get the arena used to build the merging iterator. It is called when a
+  // child iterator needs to be allocated.
+ Arena* GetArena() { return arena; }
+
+ // Return the result merging iterator.
+ InternalIterator* Finish();
+
+ private:
+ MergingIterator* merge_iter;
+ InternalIterator* first_iter;
+ bool use_merging_iter;
+ Arena* arena;
+};
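+
+// A minimal usage sketch (illustrative only; `icmp`, `arena` and the child
+// iterators are assumed to already exist):
+//
+//   MergeIteratorBuilder builder(&icmp, &arena);
+//   builder.AddIterator(memtable_iter);
+//   builder.AddIterator(sst_iter);
+//   InternalIterator* iter = builder.Finish();
+//
+// If only a single iterator is added, Finish() returns it directly and no
+// merging iterator is materialized.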
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc
new file mode 100644
index 000000000..6920bb14e
--- /dev/null
+++ b/src/rocksdb/table/meta_blocks.cc
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include "table/meta_blocks.h"
+
+#include <map>
+#include <string>
+
+#include "block_fetcher.h"
+#include "db/table_properties_collector.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/persistent_cache_helper.h"
+#include "table/table_properties_internal.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
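+// The metaindex block is built with a restart interval of 1, so every entry
+// is a restart point with a full (non-delta-encoded) key; the block only
+// holds a handful of meta-block names, and full keys keep lookups simple.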
+MetaIndexBuilder::MetaIndexBuilder()
+ : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
+
+void MetaIndexBuilder::Add(const std::string& key,
+ const BlockHandle& handle) {
+ std::string handle_encoding;
+ handle.EncodeTo(&handle_encoding);
+ meta_block_handles_.insert({key, handle_encoding});
+}
+
+Slice MetaIndexBuilder::Finish() {
+ for (const auto& metablock : meta_block_handles_) {
+ meta_index_block_->Add(metablock.first, metablock.second);
+ }
+ return meta_index_block_->Finish();
+}
+
+// The property block will be read sequentially and cached in a heap-allocated
+// object, so there's no need for restart points. Thus we set the restart
+// interval to infinity to save space.
+PropertyBlockBuilder::PropertyBlockBuilder()
+ : properties_block_(
+ new BlockBuilder(port::kMaxInt32 /* restart interval */)) {}
+
+void PropertyBlockBuilder::Add(const std::string& name,
+ const std::string& val) {
+ props_.insert({name, val});
+}
+
+void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
+ assert(props_.find(name) == props_.end());
+
+ std::string dst;
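+  // PutVarint64 uses base-128 varint encoding, one byte per 7 bits of
+  // payload, so small values take a single byte (e.g. 300 encodes as
+  // 0xAC 0x02).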
+ PutVarint64(&dst, val);
+
+ Add(name, dst);
+}
+
+void PropertyBlockBuilder::Add(
+ const UserCollectedProperties& user_collected_properties) {
+ for (const auto& prop : user_collected_properties) {
+ Add(prop.first, prop.second);
+ }
+}
+
+void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
+ TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start",
+ const_cast<TableProperties*>(&props));
+
+ Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
+ Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
+ Add(TablePropertiesNames::kDataSize, props.data_size);
+ Add(TablePropertiesNames::kIndexSize, props.index_size);
+ if (props.index_partitions != 0) {
+ Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
+ Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
+ }
+ Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
+ Add(TablePropertiesNames::kIndexValueIsDeltaEncoded,
+ props.index_value_is_delta_encoded);
+ Add(TablePropertiesNames::kNumEntries, props.num_entries);
+ Add(TablePropertiesNames::kDeletedKeys, props.num_deletions);
+ Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands);
+ Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
+ Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
+ Add(TablePropertiesNames::kFilterSize, props.filter_size);
+ Add(TablePropertiesNames::kFormatVersion, props.format_version);
+ Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
+ Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
+ Add(TablePropertiesNames::kCreationTime, props.creation_time);
+ Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
+ if (props.file_creation_time > 0) {
+ Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time);
+ }
+
+ if (!props.filter_policy_name.empty()) {
+ Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
+ }
+ if (!props.comparator_name.empty()) {
+ Add(TablePropertiesNames::kComparator, props.comparator_name);
+ }
+
+ if (!props.merge_operator_name.empty()) {
+ Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name);
+ }
+ if (!props.prefix_extractor_name.empty()) {
+ Add(TablePropertiesNames::kPrefixExtractorName,
+ props.prefix_extractor_name);
+ }
+ if (!props.property_collectors_names.empty()) {
+ Add(TablePropertiesNames::kPropertyCollectors,
+ props.property_collectors_names);
+ }
+ if (!props.column_family_name.empty()) {
+ Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name);
+ }
+
+ if (!props.compression_name.empty()) {
+ Add(TablePropertiesNames::kCompression, props.compression_name);
+ }
+ if (!props.compression_options.empty()) {
+ Add(TablePropertiesNames::kCompressionOptions, props.compression_options);
+ }
+}
+
+Slice PropertyBlockBuilder::Finish() {
+ for (const auto& prop : props_) {
+ properties_block_->Add(prop.first, prop.second);
+ }
+
+ return properties_block_->Finish();
+}
+
+void LogPropertiesCollectionError(
+ Logger* info_log, const std::string& method, const std::string& name) {
+ assert(method == "Add" || method == "Finish");
+
+ std::string msg =
+ "Encountered error when calling TablePropertiesCollector::" +
+ method + "() with collector name: " + name;
+ ROCKS_LOG_ERROR(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+ const Slice& key, const Slice& value, uint64_t file_size,
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ Logger* info_log) {
+ bool all_succeeded = true;
+ for (auto& collector : collectors) {
+ Status s = collector->InternalAdd(key, value, file_size);
+ all_succeeded = all_succeeded && s.ok();
+ if (!s.ok()) {
+ LogPropertiesCollectionError(info_log, "Add" /* method */,
+ collector->Name());
+ }
+ }
+ return all_succeeded;
+}
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast,
+ const uint64_t blockCompressedBytesSlow) {
+ for (auto& collector : collectors) {
+ collector->BlockAdd(blockRawBytes, blockCompressedBytesFast,
+ blockCompressedBytesSlow);
+ }
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ Logger* info_log, PropertyBlockBuilder* builder) {
+ bool all_succeeded = true;
+ for (auto& collector : collectors) {
+ UserCollectedProperties user_collected_properties;
+ Status s = collector->Finish(&user_collected_properties);
+
+ all_succeeded = all_succeeded && s.ok();
+ if (!s.ok()) {
+ LogPropertiesCollectionError(info_log, "Finish" /* method */,
+ collector->Name());
+ } else {
+ builder->Add(user_collected_properties);
+ }
+ }
+
+ return all_succeeded;
+}
+
+Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer, const Footer& footer,
+ const ImmutableCFOptions& ioptions,
+ TableProperties** table_properties, bool verify_checksum,
+ BlockHandle* ret_block_handle,
+ CacheAllocationPtr* verification_buf,
+ bool /*compression_type_missing*/,
+ MemoryAllocator* memory_allocator) {
+ assert(table_properties);
+
+ Slice v = handle_value;
+ BlockHandle handle;
+ if (!handle.DecodeFrom(&v).ok()) {
+ return Status::InvalidArgument("Failed to decode properties block handle");
+ }
+
+ BlockContents block_contents;
+ ReadOptions read_options;
+ read_options.verify_checksums = verify_checksum;
+ Status s;
+ PersistentCacheOptions cache_options;
+
+ BlockFetcher block_fetcher(
+ file, prefetch_buffer, footer, read_options, handle, &block_contents,
+ ioptions, false /* decompress */, false /*maybe_compressed*/,
+ BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options,
+ memory_allocator);
+ s = block_fetcher.ReadBlockContents();
+  // The property block is never compressed. Uncompress logic will need to be
+  // added if we ever start compressing it.
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ Block properties_block(std::move(block_contents),
+ kDisableGlobalSequenceNumber);
+ DataBlockIter iter;
+ properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(),
+ &iter);
+
+ auto new_table_properties = new TableProperties();
+ // All pre-defined properties of type uint64_t
+ std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+ {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+ {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+ {TablePropertiesNames::kIndexPartitions,
+ &new_table_properties->index_partitions},
+ {TablePropertiesNames::kTopLevelIndexSize,
+ &new_table_properties->top_level_index_size},
+ {TablePropertiesNames::kIndexKeyIsUserKey,
+ &new_table_properties->index_key_is_user_key},
+ {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+ &new_table_properties->index_value_is_delta_encoded},
+ {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+ {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+ {TablePropertiesNames::kRawValueSize,
+ &new_table_properties->raw_value_size},
+ {TablePropertiesNames::kNumDataBlocks,
+ &new_table_properties->num_data_blocks},
+ {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+ {TablePropertiesNames::kDeletedKeys,
+ &new_table_properties->num_deletions},
+ {TablePropertiesNames::kMergeOperands,
+ &new_table_properties->num_merge_operands},
+ {TablePropertiesNames::kNumRangeDeletions,
+ &new_table_properties->num_range_deletions},
+ {TablePropertiesNames::kFormatVersion,
+ &new_table_properties->format_version},
+ {TablePropertiesNames::kFixedKeyLen,
+ &new_table_properties->fixed_key_len},
+ {TablePropertiesNames::kColumnFamilyId,
+ &new_table_properties->column_family_id},
+ {TablePropertiesNames::kCreationTime,
+ &new_table_properties->creation_time},
+ {TablePropertiesNames::kOldestKeyTime,
+ &new_table_properties->oldest_key_time},
+ {TablePropertiesNames::kFileCreationTime,
+ &new_table_properties->file_creation_time},
+ };
+
+ std::string last_key;
+ for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) {
+ s = iter.status();
+ if (!s.ok()) {
+ break;
+ }
+
+ auto key = iter.key().ToString();
+    // The properties block should be strictly sorted with no duplicate keys.
+ if (!last_key.empty() &&
+ BytewiseComparator()->Compare(key, last_key) <= 0) {
+ s = Status::Corruption("properties unsorted");
+ break;
+ }
+ last_key = key;
+
+ auto raw_val = iter.value();
+ auto pos = predefined_uint64_properties.find(key);
+
+ new_table_properties->properties_offsets.insert(
+ {key, handle.offset() + iter.ValueOffset()});
+
+ if (pos != predefined_uint64_properties.end()) {
+ if (key == TablePropertiesNames::kDeletedKeys ||
+ key == TablePropertiesNames::kMergeOperands) {
+ // Insert in user-collected properties for API backwards compatibility
+ new_table_properties->user_collected_properties.insert(
+ {key, raw_val.ToString()});
+ }
+ // handle predefined rocksdb properties
+ uint64_t val;
+ if (!GetVarint64(&raw_val, &val)) {
+ // skip malformed value
+ auto error_msg =
+ "Detect malformed value in properties meta-block:"
+ "\tkey: " + key + "\tval: " + raw_val.ToString();
+ ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str());
+ continue;
+ }
+ *(pos->second) = val;
+ } else if (key == TablePropertiesNames::kFilterPolicy) {
+ new_table_properties->filter_policy_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kColumnFamilyName) {
+ new_table_properties->column_family_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kComparator) {
+ new_table_properties->comparator_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kMergeOperator) {
+ new_table_properties->merge_operator_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kPrefixExtractorName) {
+ new_table_properties->prefix_extractor_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kPropertyCollectors) {
+ new_table_properties->property_collectors_names = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kCompression) {
+ new_table_properties->compression_name = raw_val.ToString();
+ } else if (key == TablePropertiesNames::kCompressionOptions) {
+ new_table_properties->compression_options = raw_val.ToString();
+ } else {
+ // handle user-collected properties
+ new_table_properties->user_collected_properties.insert(
+ {key, raw_val.ToString()});
+ }
+ }
+ if (s.ok()) {
+ *table_properties = new_table_properties;
+ if (ret_block_handle != nullptr) {
+ *ret_block_handle = handle;
+ }
+ if (verification_buf != nullptr) {
+ size_t len = static_cast<size_t>(handle.size() + kBlockTrailerSize);
+ *verification_buf =
+ ROCKSDB_NAMESPACE::AllocateBlock(len, memory_allocator);
+ if (verification_buf->get() != nullptr) {
+ memcpy(verification_buf->get(), block_contents.data.data(), len);
+ }
+ }
+ } else {
+ delete new_table_properties;
+ }
+
+ return s;
+}
+
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ TableProperties** properties,
+ bool compression_type_missing,
+ MemoryAllocator* memory_allocator) {
+ // -- Read metaindex block
+ Footer footer;
+ auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size,
+ &footer, table_magic_number);
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto metaindex_handle = footer.metaindex_handle();
+ BlockContents metaindex_contents;
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+ PersistentCacheOptions cache_options;
+
+ BlockFetcher block_fetcher(
+ file, nullptr /* prefetch_buffer */, footer, read_options,
+ metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kMetaIndex,
+ UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+ s = block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ return s;
+ }
+  // Property blocks are never compressed. Uncompress logic will need to be
+  // added if we ever start compressing them.
+ Block metaindex_block(std::move(metaindex_contents),
+ kDisableGlobalSequenceNumber);
+ std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator(
+ BytewiseComparator(), BytewiseComparator()));
+
+ // -- Read property block
+ bool found_properties_block = true;
+ s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TableProperties table_properties;
+ if (found_properties_block == true) {
+ s = ReadProperties(
+ meta_iter->value(), file, nullptr /* prefetch_buffer */, footer,
+ ioptions, properties, false /* verify_checksum */,
+        nullptr /* ret_block_handle */, nullptr /* verification_buf */,
+ compression_type_missing, memory_allocator);
+ } else {
+ s = Status::NotFound();
+ }
+
+ return s;
+}
+
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+ const std::string& meta_block_name,
+ BlockHandle* block_handle) {
+ meta_index_iter->Seek(meta_block_name);
+ if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
+ meta_index_iter->key() == meta_block_name) {
+ Slice v = meta_index_iter->value();
+ return block_handle->DecodeFrom(&v);
+ } else {
+ return Status::Corruption("Cannot find the meta block", meta_block_name);
+ }
+}
+
+Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ const std::string& meta_block_name,
+ BlockHandle* block_handle,
+ bool /*compression_type_missing*/,
+ MemoryAllocator* memory_allocator) {
+ Footer footer;
+ auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size,
+ &footer, table_magic_number);
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto metaindex_handle = footer.metaindex_handle();
+ BlockContents metaindex_contents;
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+ PersistentCacheOptions cache_options;
+ BlockFetcher block_fetcher(
+ file, nullptr /* prefetch_buffer */, footer, read_options,
+ metaindex_handle, &metaindex_contents, ioptions,
+ false /* do decompression */, false /*maybe_compressed*/,
+ BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options,
+ memory_allocator);
+ s = block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ return s;
+ }
+  // Meta blocks are never compressed. Uncompress logic will need to be added
+  // if we ever start compressing them.
+ Block metaindex_block(std::move(metaindex_contents),
+ kDisableGlobalSequenceNumber);
+
+ std::unique_ptr<InternalIterator> meta_iter;
+ meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(),
+ BytewiseComparator()));
+
+ return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
+}
+
+Status ReadMetaBlock(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ const std::string& meta_block_name, BlockType block_type,
+ BlockContents* contents, bool /*compression_type_missing*/,
+ MemoryAllocator* memory_allocator) {
+ Status status;
+ Footer footer;
+ status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer,
+ table_magic_number);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Reading metaindex block
+ auto metaindex_handle = footer.metaindex_handle();
+ BlockContents metaindex_contents;
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+ PersistentCacheOptions cache_options;
+
+ BlockFetcher block_fetcher(
+ file, prefetch_buffer, footer, read_options, metaindex_handle,
+ &metaindex_contents, ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kMetaIndex,
+ UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+ status = block_fetcher.ReadBlockContents();
+ if (!status.ok()) {
+ return status;
+ }
+  // The meta block is never compressed. Uncompress logic will need to be
+  // added if we ever start compressing it.
+
+ // Finding metablock
+ Block metaindex_block(std::move(metaindex_contents),
+ kDisableGlobalSequenceNumber);
+
+ std::unique_ptr<InternalIterator> meta_iter;
+ meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(),
+ BytewiseComparator()));
+
+ BlockHandle block_handle;
+ status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Reading metablock
+ BlockFetcher block_fetcher2(
+ file, prefetch_buffer, footer, read_options, block_handle, contents,
+ ioptions, false /* decompress */, false /*maybe_compressed*/, block_type,
+ UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+ return block_fetcher2.ReadBlockContents();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h
new file mode 100644
index 000000000..0fd140cf0
--- /dev/null
+++ b/src/rocksdb/table/meta_blocks.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/block_type.h"
+#include "table/format.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Footer;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+class MetaIndexBuilder {
+ public:
+ MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+ MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+ MetaIndexBuilder();
+ void Add(const std::string& key, const BlockHandle& handle);
+
+ // Write all the added key/value pairs to the block and return the contents
+ // of the block.
+ Slice Finish();
+
+ private:
+  // Stores the sorted key/handle pairs of the meta blocks.
+ stl_wrappers::KVMap meta_block_handles_;
+ std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+ PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+ PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+ PropertyBlockBuilder();
+
+ void AddTableProperty(const TableProperties& props);
+ void Add(const std::string& key, uint64_t value);
+ void Add(const std::string& key, const std::string& value);
+ void Add(const UserCollectedProperties& user_collected_properties);
+
+ // Write all the added entries to the block and return the block contents
+ Slice Finish();
+
+ private:
+ std::unique_ptr<BlockBuilder> properties_block_;
+ stl_wrappers::KVMap props_;
+};
+
+// If we encounter any error during user-defined statistics collection, we'll
+// write a warning message to the info log.
+void LogPropertiesCollectionError(
+ Logger* info_log, const std::string& method, const std::string& name);
+
+// Utility functions that help the table builder trigger batch events for
+// user-defined property collectors.
+// The return value indicates whether any error occurred; if so, a warning
+// message will have been logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+ const Slice& key, const Slice& value, uint64_t file_size,
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ Logger* info_log);
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ uint64_t blockRawBytes, uint64_t blockCompressedBytesFast,
+ uint64_t blockCompressedBytesSlow);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+ const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+ Logger* info_log, PropertyBlockBuilder* builder);
+
+// Read the properties from the table.
+// @returns a status to indicate if the operation succeeded. On success,
+// *table_properties will point to a heap-allocated TableProperties
+// object; otherwise the value of `table_properties` is not modified.
+Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer, const Footer& footer,
+ const ImmutableCFOptions& ioptions,
+ TableProperties** table_properties, bool verify_checksum,
+ BlockHandle* block_handle,
+ CacheAllocationPtr* verification_buf,
+ bool compression_type_missing = false,
+ MemoryAllocator* memory_allocator = nullptr);
+
+// Directly read the properties from the properties block of a plain table.
+// @returns a status to indicate if the operation succeeded. On success,
+// *table_properties will point to a heap-allocated TableProperties
+// object; otherwise the value of `table_properties` is not modified.
+// Certain tables do not have the compression_type byte set up properly for
+// uncompressed blocks. The caller can request that the compression type be
+// reset by passing compression_type_missing = true; the same applies to
+// `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock`.
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ TableProperties** properties,
+ bool compression_type_missing = false,
+ MemoryAllocator* memory_allocator = nullptr);
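+
+// A minimal usage sketch (illustrative only; `reader` and `file_size` come
+// from the opened SST file, and kBlockBasedTableMagicNumber identifies
+// block-based tables):
+//
+//   TableProperties* props = nullptr;
+//   Status s = ReadTableProperties(reader, file_size,
+//                                  kBlockBasedTableMagicNumber, ioptions,
+//                                  &props);
+//   if (s.ok()) {
+//     uint64_t n = props->num_entries;
+//     delete props;  // the caller owns the heap-allocated object
+//   }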
+
+// Find the meta block from the meta index block.
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+ const std::string& meta_block_name,
+ BlockHandle* block_handle);
+
+// Find the meta block in the file, reading the footer and metaindex block.
+Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ const std::string& meta_block_name,
+ BlockHandle* block_handle,
+ bool compression_type_missing = false,
+ MemoryAllocator* memory_allocator = nullptr);
+
+// Read the specified meta block with name meta_block_name
+// from `file` and initialize `contents` with contents of this block.
+// Return Status::OK in case of success.
+Status ReadMetaBlock(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+ uint64_t table_magic_number,
+ const ImmutableCFOptions& ioptions,
+ const std::string& meta_block_name, BlockType block_type,
+ BlockContents* contents,
+ bool compression_type_missing = false,
+ MemoryAllocator* memory_allocator = nullptr);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc
new file mode 100644
index 000000000..9ef44628a
--- /dev/null
+++ b/src/rocksdb/table/mock_table.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/mock_table.h"
+
+#include "db/dbformat.h"
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/table_properties.h"
+#include "table/get_context.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+
+namespace {
+
+const InternalKeyComparator icmp_(BytewiseComparator());
+
+} // namespace
+
+stl_wrappers::KVMap MakeMockFile(
+ std::initializer_list<std::pair<const std::string, std::string>> l) {
+ return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_));
+}
+
+InternalIterator* MockTableReader::NewIterator(
+ const ReadOptions&, const SliceTransform* /* prefix_extractor */,
+ Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+ size_t /*compaction_readahead_size*/) {
+ return new MockTableIterator(table_);
+}
+
+Status MockTableReader::Get(const ReadOptions&, const Slice& key,
+ GetContext* get_context,
+ const SliceTransform* /*prefix_extractor*/,
+ bool /*skip_filters*/) {
+ std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_));
+ for (iter->Seek(key); iter->Valid(); iter->Next()) {
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(iter->key(), &parsed_key)) {
+ return Status::Corruption(Slice());
+ }
+
+ bool dont_care __attribute__((__unused__));
+ if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) {
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
+ const {
+ return std::shared_ptr<const TableProperties>(new TableProperties());
+}
+
+MockTableFactory::MockTableFactory() : next_id_(1) {}
+
+Status MockTableFactory::NewTableReader(
+ const TableReaderOptions& /*table_reader_options*/,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/,
+ std::unique_ptr<TableReader>* table_reader,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ uint32_t id = GetIDFromFile(file.get());
+
+ MutexLock lock_guard(&file_system_.mutex);
+
+ auto it = file_system_.files.find(id);
+ if (it == file_system_.files.end()) {
+ return Status::IOError("Mock file not found");
+ }
+
+ table_reader->reset(new MockTableReader(it->second));
+
+ return Status::OK();
+}
+
+TableBuilder* MockTableFactory::NewTableBuilder(
+ const TableBuilderOptions& /*table_builder_options*/,
+ uint32_t /*column_family_id*/, WritableFileWriter* file) const {
+ uint32_t id = GetAndWriteNextID(file);
+
+ return new MockTableBuilder(id, &file_system_);
+}
+
+Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
+ stl_wrappers::KVMap file_contents) {
+ std::unique_ptr<WritableFile> file;
+ auto s = env->NewWritableFile(fname, &file, EnvOptions());
+ if (!s.ok()) {
+ return s;
+ }
+
+ WritableFileWriter file_writer(NewLegacyWritableFileWrapper(std::move(file)),
+ fname, EnvOptions());
+
+ uint32_t id = GetAndWriteNextID(&file_writer);
+ file_system_.files.insert({id, std::move(file_contents)});
+ return Status::OK();
+}
+
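+// Each mock "file" begins with a fixed 4-byte ID. The builder writes it when
+// the file is created, and the reader decodes it to locate the corresponding
+// in-memory KVMap in MockTableFileSystem.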
+uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const {
+ uint32_t next_id = next_id_.fetch_add(1);
+ char buf[4];
+ EncodeFixed32(buf, next_id);
+ file->Append(Slice(buf, 4));
+ return next_id;
+}
+
+uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const {
+ char buf[4];
+ Slice result;
+ file->Read(0, 4, &result, buf);
+ assert(result.size() == 4);
+ return DecodeFixed32(buf);
+}
+
+void MockTableFactory::AssertSingleFile(
+ const stl_wrappers::KVMap& file_contents) {
+ ASSERT_EQ(file_system_.files.size(), 1U);
+ ASSERT_EQ(file_contents, file_system_.files.begin()->second);
+}
+
+void MockTableFactory::AssertLatestFile(
+ const stl_wrappers::KVMap& file_contents) {
+ ASSERT_GE(file_system_.files.size(), 1U);
+ auto latest = file_system_.files.end();
+ --latest;
+
+ if (file_contents != latest->second) {
+ std::cout << "Wrong content! Content of latest file:" << std::endl;
+ for (const auto& kv : latest->second) {
+ ParsedInternalKey ikey;
+ std::string key, value;
+ std::tie(key, value) = kv;
+ ParseInternalKey(Slice(key), &ikey);
+ std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
+ }
+ FAIL();
+ }
+}
+
+} // namespace mock
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h
new file mode 100644
index 000000000..9e80c8d04
--- /dev/null
+++ b/src/rocksdb/table/mock_table.h
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table.h"
+#include "table/internal_iterator.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/kv_map.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+
+stl_wrappers::KVMap MakeMockFile(
+ std::initializer_list<std::pair<const std::string, std::string>> l = {});
+
+struct MockTableFileSystem {
+ port::Mutex mutex;
+ std::map<uint32_t, stl_wrappers::KVMap> files;
+};
+
+class MockTableReader : public TableReader {
+ public:
+ explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {}
+
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
+
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ uint64_t ApproximateOffsetOf(const Slice& /*key*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ size_t ApproximateMemoryUsage() const override { return 0; }
+
+ void SetupForCompaction() override {}
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+ ~MockTableReader() {}
+
+ private:
+ const stl_wrappers::KVMap& table_;
+};
+
+class MockTableIterator : public InternalIterator {
+ public:
+ explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) {
+ itr_ = table_.end();
+ }
+
+ bool Valid() const override { return itr_ != table_.end(); }
+
+ void SeekToFirst() override { itr_ = table_.begin(); }
+
+ void SeekToLast() override {
+ itr_ = table_.end();
+ --itr_;
+ }
+
+ void Seek(const Slice& target) override {
+ std::string str_target(target.data(), target.size());
+ itr_ = table_.lower_bound(str_target);
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ std::string str_target(target.data(), target.size());
+ itr_ = table_.upper_bound(str_target);
+ Prev();
+ }
+
+ void Next() override { ++itr_; }
+
+ void Prev() override {
+ if (itr_ == table_.begin()) {
+ itr_ = table_.end();
+ } else {
+ --itr_;
+ }
+ }
+
+ Slice key() const override { return Slice(itr_->first); }
+
+ Slice value() const override { return Slice(itr_->second); }
+
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const stl_wrappers::KVMap& table_;
+ stl_wrappers::KVMap::const_iterator itr_;
+};
+
+class MockTableBuilder : public TableBuilder {
+ public:
+ MockTableBuilder(uint32_t id, MockTableFileSystem* file_system)
+ : id_(id), file_system_(file_system) {
+ table_ = MakeMockFile({});
+ }
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~MockTableBuilder() {}
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override {
+ table_.insert({key.ToString(), value.ToString()});
+ }
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override { return Status::OK(); }
+
+ Status Finish() override {
+ MutexLock lock_guard(&file_system_->mutex);
+ file_system_->files.insert({id_, table_});
+ return Status::OK();
+ }
+
+ void Abandon() override {}
+
+ uint64_t NumEntries() const override { return table_.size(); }
+
+ uint64_t FileSize() const override { return table_.size(); }
+
+ TableProperties GetTableProperties() const override {
+ return TableProperties();
+ }
+
+ // Get file checksum
+ const std::string& GetFileChecksum() const override { return file_checksum_; }
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override {
+ return kUnknownFileChecksumFuncName.c_str();
+ }
+
+ private:
+ uint32_t id_;
+ MockTableFileSystem* file_system_;
+ stl_wrappers::KVMap table_;
+ std::string file_checksum_ = kUnknownFileChecksum;
+};
+
+class MockTableFactory : public TableFactory {
+ public:
+ MockTableFactory();
+ const char* Name() const override { return "MockTable"; }
+ Status NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache = true) const override;
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+      uint32_t column_family_id, WritableFileWriter* file) const override;
+
+ // This function will directly create mock table instead of going through
+ // MockTableBuilder. file_contents has to have a format of <internal_key,
+ // value>. Those key-value pairs will then be inserted into the mock table.
+ Status CreateMockTable(Env* env, const std::string& fname,
+ stl_wrappers::KVMap file_contents);
+
+ virtual Status SanitizeOptions(
+ const DBOptions& /*db_opts*/,
+ const ColumnFamilyOptions& /*cf_opts*/) const override {
+ return Status::OK();
+ }
+
+ virtual std::string GetPrintableTableOptions() const override {
+ return std::string();
+ }
+
+ // This function will assert that only a single file exists and that the
+ // contents are equal to file_contents
+ void AssertSingleFile(const stl_wrappers::KVMap& file_contents);
+ void AssertLatestFile(const stl_wrappers::KVMap& file_contents);
+
+ private:
+ uint32_t GetAndWriteNextID(WritableFileWriter* file) const;
+ uint32_t GetIDFromFile(RandomAccessFileReader* file) const;
+
+ mutable MockTableFileSystem file_system_;
+ mutable std::atomic<uint32_t> next_id_;
+};
+
+} // namespace mock
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/multiget_context.h b/src/rocksdb/table/multiget_context.h
new file mode 100644
index 000000000..0c5848c82
--- /dev/null
+++ b/src/rocksdb/table/multiget_context.h
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <array>
+#include <string>
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class GetContext;
+
+struct KeyContext {
+ const Slice* key;
+ LookupKey* lkey;
+ Slice ukey;
+ Slice ikey;
+ ColumnFamilyHandle* column_family;
+ Status* s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq;
+ bool key_exists;
+ void* cb_arg;
+ PinnableSlice* value;
+ GetContext* get_context;
+
+ KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key,
+ PinnableSlice* val, Status* stat)
+ : key(&user_key),
+ lkey(nullptr),
+ column_family(col_family),
+ s(stat),
+ max_covering_tombstone_seq(0),
+ key_exists(false),
+ cb_arg(nullptr),
+ value(val),
+ get_context(nullptr) {}
+
+ KeyContext() = default;
+};
+
+// The MultiGetContext class is a container for the sorted list of keys that
+// we need to lookup in a batch. Its main purpose is to make batch execution
+// easier by allowing various stages of the MultiGet lookups to operate on
+// subsets of keys, potentially non-contiguous. In order to accomplish this,
+// it defines the following classes -
+//
+// MultiGetContext::Range
+// MultiGetContext::Range::Iterator
+// MultiGetContext::Range::IteratorWrapper
+//
+// Here is an example of how this can be used -
+//
+// {
+// MultiGetContext ctx(...);
+// MultiGetContext::Range range = ctx.GetMultiGetRange();
+//
+// // Iterate to determine some subset of the keys
+// MultiGetContext::Range::Iterator start = range.begin();
+// MultiGetContext::Range::Iterator end = ...;
+//
+// // Make a new range with a subset of keys
+// MultiGetContext::Range subrange(range, start, end);
+//
+//     // Define an auxiliary vector, if needed, to hold additional data for
+// // each key
+// std::array<Foo, MultiGetContext::MAX_BATCH_SIZE> aux;
+//
+//     // Iterate over the subrange and the auxiliary vector simultaneously
+// MultiGetContext::Range::Iterator iter = subrange.begin();
+// for (; iter != subrange.end(); ++iter) {
+// KeyContext& key = *iter;
+// Foo& aux_key = aux_iter[iter.index()];
+// ...
+// }
+// }
+class MultiGetContext {
+ public:
+ // Limit the number of keys in a batch to this number. Benchmarks show that
+ // there is negligible benefit for batches exceeding this. Keeping this < 64
+  // simplifies iteration, as well as reduces the number of stack allocations
+  // that need to be performed
+ static const int MAX_BATCH_SIZE = 32;
+
+ MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
+ size_t begin, size_t num_keys, SequenceNumber snapshot)
+ : num_keys_(num_keys),
+ value_mask_(0),
+ lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) {
+ if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
+ lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
+ lookup_key_ptr_ = reinterpret_cast<LookupKey*>(
+ lookup_key_heap_buf.get());
+ }
+
+ for (size_t iter = 0; iter != num_keys_; ++iter) {
+ // autovector may not be contiguous storage, so make a copy
+ sorted_keys_[iter] = (*sorted_keys)[begin + iter];
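+      // Placement-new each LookupKey into the preallocated stack or heap
+      // buffer; ~MultiGetContext() destroys them explicitly.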
+ sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
+ LookupKey(*sorted_keys_[iter]->key, snapshot);
+ sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key();
+ sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key();
+ }
+ }
+
+ ~MultiGetContext() {
+ for (size_t i = 0; i < num_keys_; ++i) {
+ lookup_key_ptr_[i].~LookupKey();
+ }
+ }
+
+ private:
+ static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
+ alignas(alignof(LookupKey))
+ char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK];
+ std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_;
+ size_t num_keys_;
+ uint64_t value_mask_;
+ std::unique_ptr<char[]> lookup_key_heap_buf;
+ LookupKey* lookup_key_ptr_;
+
+ public:
+ // MultiGetContext::Range - Specifies a range of keys, by start and end index,
+ // from the parent MultiGetContext. Each range contains a bit vector that
+ // indicates whether the corresponding keys need to be processed or skipped.
+ // A Range object can be copy constructed, and the new object inherits the
+ // original Range's bit vector. This is useful for progressively skipping
+ // keys as the lookup goes through various stages. For example, when looking
+ // up keys in the same SST file, a Range is created excluding keys not
+ // belonging to that file. A new Range is then copy constructed and individual
+ // keys are skipped based on bloom filter lookup.
+ class Range {
+ public:
+    // MultiGetContext::Range::Iterator - A forward iterator over the keys in
+    // a Range that are neither marked skipped nor already have their final
+    // value found. The latter is tracked by MultiGetContext::value_mask_.
+ class Iterator {
+ public:
+ // -- iterator traits
+ typedef Iterator self_type;
+ typedef KeyContext value_type;
+ typedef KeyContext& reference;
+ typedef KeyContext* pointer;
+ typedef int difference_type;
+ typedef std::forward_iterator_tag iterator_category;
+
+ Iterator(const Range* range, size_t idx)
+ : range_(range), ctx_(range->ctx_), index_(idx) {
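+        // Skip forward past any key whose bit is set in value_mask_ (final
+        // value already found) or skip_mask_ (explicitly skipped).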
+ while (index_ < range_->end_ &&
+ (1ull << index_) &
+ (range_->ctx_->value_mask_ | range_->skip_mask_))
+ index_++;
+ }
+
+ Iterator(const Iterator&) = default;
+ Iterator& operator=(const Iterator&) = default;
+
+ Iterator& operator++() {
+ while (++index_ < range_->end_ &&
+ (1ull << index_) &
+ (range_->ctx_->value_mask_ | range_->skip_mask_))
+ ;
+ return *this;
+ }
+
+ bool operator==(Iterator other) const {
+ assert(range_->ctx_ == other.range_->ctx_);
+ return index_ == other.index_;
+ }
+
+ bool operator!=(Iterator other) const {
+ assert(range_->ctx_ == other.range_->ctx_);
+ return index_ != other.index_;
+ }
+
+ KeyContext& operator*() {
+ assert(index_ < range_->end_ && index_ >= range_->start_);
+ return *(ctx_->sorted_keys_[index_]);
+ }
+
+ KeyContext* operator->() {
+ assert(index_ < range_->end_ && index_ >= range_->start_);
+ return ctx_->sorted_keys_[index_];
+ }
+
+ size_t index() { return index_; }
+
+ private:
+ friend Range;
+ const Range* range_;
+ const MultiGetContext* ctx_;
+ size_t index_;
+ };
+
+ Range(const Range& mget_range,
+ const Iterator& first,
+ const Iterator& last) {
+ ctx_ = mget_range.ctx_;
+ start_ = first.index_;
+ end_ = last.index_;
+ skip_mask_ = mget_range.skip_mask_;
+ }
+
+ Range() = default;
+
+ Iterator begin() const { return Iterator(this, start_); }
+
+ Iterator end() const { return Iterator(this, end_); }
+
+ bool empty() {
+ return (((1ull << end_) - 1) & ~((1ull << start_) - 1) &
+ ~(ctx_->value_mask_ | skip_mask_)) == 0;
+ }
+
+ void SkipKey(const Iterator& iter) { skip_mask_ |= 1ull << iter.index_; }
+
+    // Update the value_mask_ in MultiGetContext so it's
+    // immediately reflected in all the Range Iterators
+ void MarkKeyDone(Iterator& iter) {
+ ctx_->value_mask_ |= (1ull << iter.index_);
+ }
+
+ bool CheckKeyDone(Iterator& iter) {
+ return ctx_->value_mask_ & (1ull << iter.index_);
+ }
+
+ uint64_t KeysLeft() {
+ uint64_t new_val = skip_mask_ | ctx_->value_mask_;
+ uint64_t count = 0;
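+      // Kernighan's bit count: n & (n - 1) clears the lowest set bit, so the
+      // loop runs once per skipped or completed key.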
+ while (new_val) {
+ new_val = new_val & (new_val - 1);
+ count++;
+ }
+ return end_ - count;
+ }
+
+ private:
+ friend MultiGetContext;
+ MultiGetContext* ctx_;
+ size_t start_;
+ size_t end_;
+ uint64_t skip_mask_;
+
+ Range(MultiGetContext* ctx, size_t num_keys)
+ : ctx_(ctx), start_(0), end_(num_keys), skip_mask_(0) {}
+ };
+
+ // Return the initial range that encompasses all the keys in the batch
+ Range GetMultiGetRange() { return Range(this, num_keys_); }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_helper.cc b/src/rocksdb/table/persistent_cache_helper.cc
new file mode 100644
index 000000000..8797c9b9b
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_helper.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/persistent_cache_helper.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void PersistentCacheHelper::InsertRawPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ const char* data, const size_t size) {
+ assert(cache_options.persistent_cache);
+ assert(cache_options.persistent_cache->IsCompressed());
+
+ // construct the page key
+ char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(),
+ cache_options.key_prefix.size(),
+ handle, cache_key);
+ // insert content to cache
+ cache_options.persistent_cache->Insert(key, data, size);
+}
+
+void PersistentCacheHelper::InsertUncompressedPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ const BlockContents& contents) {
+ assert(cache_options.persistent_cache);
+ assert(!cache_options.persistent_cache->IsCompressed());
+ // Precondition:
+ // (1) content is cacheable
+ // (2) content is not compressed
+
+ // construct the page key
+ char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(),
+ cache_options.key_prefix.size(),
+ handle, cache_key);
+ // insert block contents to page cache
+ cache_options.persistent_cache->Insert(key, contents.data.data(),
+ contents.data.size());
+}
+
+Status PersistentCacheHelper::LookupRawPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ std::unique_ptr<char[]>* raw_data, const size_t raw_data_size) {
+#ifdef NDEBUG
+ (void)raw_data_size;
+#endif
+ assert(cache_options.persistent_cache);
+ assert(cache_options.persistent_cache->IsCompressed());
+
+ // construct the page key
+ char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(),
+ cache_options.key_prefix.size(),
+ handle, cache_key);
+ // Lookup page
+ size_t size;
+ Status s = cache_options.persistent_cache->Lookup(key, raw_data, &size);
+ if (!s.ok()) {
+ // cache miss
+ RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS);
+ return s;
+ }
+
+ // cache hit
+ assert(raw_data_size == handle.size() + kBlockTrailerSize);
+ assert(size == raw_data_size);
+ RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT);
+ return Status::OK();
+}
+
+Status PersistentCacheHelper::LookupUncompressedPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ BlockContents* contents) {
+ assert(cache_options.persistent_cache);
+ assert(!cache_options.persistent_cache->IsCompressed());
+ if (!contents) {
+    // We shouldn't look up in the cache: there is nowhere to store the
+    // result, so treat it as a miss.
+ return Status::NotFound();
+ }
+
+ // construct the page key
+ char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(),
+ cache_options.key_prefix.size(),
+ handle, cache_key);
+ // Lookup page
+ std::unique_ptr<char[]> data;
+ size_t size;
+ Status s = cache_options.persistent_cache->Lookup(key, &data, &size);
+ if (!s.ok()) {
+ // cache miss
+ RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS);
+ return s;
+ }
+
+ // please note we are potentially comparing compressed data size with
+ // uncompressed data size
+ assert(handle.size() <= size);
+
+ // update stats
+ RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT);
+ // construct result and return
+ *contents = BlockContents(std::move(data), size);
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_helper.h b/src/rocksdb/table/persistent_cache_helper.h
new file mode 100644
index 000000000..1db855729
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_helper.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "monitoring/statistics.h"
+#include "table/format.h"
+#include "table/persistent_cache_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+
+// PersistentCacheHelper
+//
+// Encapsulates some of the helper logic for reading from and writing to the
+// cache
+class PersistentCacheHelper {
+ public:
+ // insert block into raw page cache
+ static void InsertRawPage(const PersistentCacheOptions& cache_options,
+ const BlockHandle& handle, const char* data,
+ const size_t size);
+
+ // insert block into uncompressed cache
+ static void InsertUncompressedPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ const BlockContents& contents);
+
+  // lookup block from raw page cache
+ static Status LookupRawPage(const PersistentCacheOptions& cache_options,
+ const BlockHandle& handle,
+ std::unique_ptr<char[]>* raw_data,
+ const size_t raw_data_size);
+
+ // lookup block from uncompressed cache
+ static Status LookupUncompressedPage(
+ const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+ BlockContents* contents);
+};
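+
+// Illustrative usage sketch (comments only, not part of the original change;
+// `opts`, `handle`, `data`, `size` and `contents` are assumed to be set up
+// by the caller):
+//
+//   PersistentCacheOptions opts(cache, key_prefix, stats);
+//   if (opts.persistent_cache->IsCompressed()) {
+//     PersistentCacheHelper::InsertRawPage(opts, handle, data, size);
+//   } else {
+//     PersistentCacheHelper::InsertUncompressedPage(opts, handle, contents);
+//   }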
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_options.h b/src/rocksdb/table/persistent_cache_options.h
new file mode 100644
index 000000000..7c65a041a
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_options.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "monitoring/statistics.h"
+#include "rocksdb/persistent_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCacheOptions
+//
+// Describes the caching behavior for the page cache. It is used to pass
+// the caching context and the cache handle around.
+struct PersistentCacheOptions {
+ PersistentCacheOptions() {}
+ explicit PersistentCacheOptions(
+ const std::shared_ptr<PersistentCache>& _persistent_cache,
+ const std::string _key_prefix, Statistics* const _statistics)
+ : persistent_cache(_persistent_cache),
+ key_prefix(_key_prefix),
+ statistics(_statistics) {}
+
+ virtual ~PersistentCacheOptions() {}
+
+ std::shared_ptr<PersistentCache> persistent_cache;
+ std::string key_prefix;
+ Statistics* statistics = nullptr;
+};
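+
+// A minimal construction sketch (illustrative; `cache` and `stats` are
+// assumed to already exist):
+//
+//   std::shared_ptr<PersistentCache> cache = /* opened elsewhere */;
+//   Statistics* stats = /* may be nullptr */;
+//   PersistentCacheOptions opts(cache, /*_key_prefix=*/"sst1", stats);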
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc
new file mode 100644
index 000000000..7b6833524
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/plain/plain_table_bloom.h"
+
+#include <algorithm>
+#include <string>
+#include "util/dynamic_bloom.h"
+
+#include "memory/allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+ uint32_t num_blocks =
+ (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_blocks an odd number so that more bits are involved in
+  // determining which block a hash maps to.
+ if (num_blocks % 2 == 0) {
+ num_blocks++;
+ }
+
+ return num_blocks * (CACHE_LINE_SIZE * 8);
+}
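+
+// Worked example (illustrative, assuming CACHE_LINE_SIZE == 64, i.e.
+// 512 bits per cache line): total_bits = 5000 gives
+// num_blocks = (5000 + 511) / 512 = 10, rounded up to the odd 11,
+// so the function returns 11 * 512 = 5632 bits.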
+} // namespace
+
+PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes)
+ : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits,
+ uint32_t num_blocks) {
+ data_ = raw_data;
+ kTotalBits = total_bits;
+ kNumBlocks = num_blocks;
+}
+
+void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality,
+ size_t huge_page_tlb_size,
+ Logger* logger) {
+ kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+ : (total_bits + 7) / 8 * 8;
+ kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+ assert(kNumBlocks > 0 || kTotalBits > 0);
+ assert(kNumProbes > 0);
+
+ uint32_t sz = kTotalBits / 8;
+ if (kNumBlocks > 0) {
+ sz += CACHE_LINE_SIZE - 1;
+ }
+ assert(allocator);
+
+ char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+ memset(raw, 0, sz);
+ auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+ if (kNumBlocks > 0 && cache_line_offset > 0) {
+ raw += CACHE_LINE_SIZE - cache_line_offset;
+ }
+ data_ = raw;
+}
+
+void BloomBlockBuilder::AddKeysHashes(
+ const std::vector<uint32_t>& keys_hashes) {
+ for (auto hash : keys_hashes) {
+ bloom_.AddHash(hash);
+ }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h
new file mode 100644
index 000000000..fdacdb0d5
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+#include "port/port.h"
+#include "util/bloom_impl.h"
+#include "util/hash.h"
+
+#include "third-party/folly/folly/ConstexprMath.h"
+
+#include <memory>
+
+namespace ROCKSDB_NAMESPACE {
+class Slice;
+class Allocator;
+class Logger;
+
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
+class PlainTableBloomV1 {
+ public:
+  // allocator: pass an allocator to the bloom filter so that its memory
+  //             usage can be tracked
+ // total_bits: fixed total bits for the bloom
+ // num_probes: number of hash probes for a single key
+  // locality: if positive, optimize for cache-line locality; 0 disables this.
+ // hash_func: customized hash function
+ // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB
+ // within this page size. Need to reserve huge pages for
+ // it to be allocated, like:
+ // sysctl -w vm.nr_hugepages=20
+ // See linux doc Documentation/vm/hugetlbpage.txt
+ explicit PlainTableBloomV1(uint32_t num_probes = 6);
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger);
+
+ ~PlainTableBloomV1() {}
+
+ // Assuming single threaded access to this function.
+ void AddHash(uint32_t hash);
+
+ // Multithreaded access to this function is OK
+ bool MayContainHash(uint32_t hash) const;
+
+ void Prefetch(uint32_t hash);
+
+ uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+ Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); }
+
+ void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0);
+
+ uint32_t GetTotalBits() const { return kTotalBits; }
+
+ bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+ uint32_t kTotalBits;
+ uint32_t kNumBlocks;
+ const uint32_t kNumProbes;
+
+ char* data_;
+
+ static constexpr int LOG2_CACHE_LINE_SIZE =
+ folly::constexpr_log2(CACHE_LINE_SIZE);
+};
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void PlainTableBloomV1::Prefetch(uint32_t h) {
+ if (kNumBlocks != 0) {
+ uint32_t ignored;
+ LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch(
+ h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE);
+ }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const {
+ assert(IsInitialized());
+ if (kNumBlocks != 0) {
+ return LegacyLocalityBloomImpl<true>::HashMayMatch(
+ h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE);
+ } else {
+ return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes,
+ data_);
+ }
+}
+
+inline void PlainTableBloomV1::AddHash(uint32_t h) {
+ assert(IsInitialized());
+ if (kNumBlocks != 0) {
+ LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_,
+ LOG2_CACHE_LINE_SIZE);
+ } else {
+ LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_);
+ }
+}
+
+class BloomBlockBuilder {
+ public:
+ static const std::string kBloomBlock;
+
+ explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {}
+
+ void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+ uint32_t locality, size_t huge_page_tlb_size,
+ Logger* logger) {
+ bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size,
+ logger);
+ }
+
+ uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+ void AddKeysHashes(const std::vector<uint32_t>& keys_hashes);
+
+ Slice Finish();
+
+ private:
+ PlainTableBloomV1 bloom_;
+};
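+
+// Illustrative write-side sketch (not part of the original change; `arena`,
+// `num_keys`, `hashes` and `logger` are assumed):
+//
+//   BloomBlockBuilder builder(/*num_probes=*/6);
+//   builder.SetTotalBits(&arena, num_keys * 10 /*bits per key*/,
+//                        /*locality=*/1, /*huge_page_tlb_size=*/0, logger);
+//   builder.AddKeysHashes(hashes);
+//   Slice bloom_contents = builder.Finish();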
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc
new file mode 100644
index 000000000..147e46db2
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.cc
@@ -0,0 +1,314 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_builder.h"
+
+#include <assert.h>
+
+#include <string>
+#include <limits>
+#include <map>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A utility that helps write block contents to the file.
+//   @offset is advanced if @block_contents is successfully written.
+//   @block_handle receives the handle of this particular block.
+Status WriteBlock(const Slice& block_contents, WritableFileWriter* file,
+ uint64_t* offset, BlockHandle* block_handle) {
+ block_handle->set_offset(*offset);
+ block_handle->set_size(block_contents.size());
+ Status s = file->Append(block_contents);
+
+ if (s.ok()) {
+ *offset += block_contents.size();
+ }
+ return s;
+}
+
+} // namespace
+
+// kPlainTableMagicNumber was picked by running
+// echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+PlainTableBuilder::PlainTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len,
+ EncodingType encoding_type, size_t index_sparseness,
+ uint32_t bloom_bits_per_key, const std::string& column_family_name,
+ uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio,
+ bool store_index_in_file)
+ : ioptions_(ioptions),
+ moptions_(moptions),
+ bloom_block_(num_probes),
+ file_(file),
+ bloom_bits_per_key_(bloom_bits_per_key),
+ huge_page_tlb_size_(huge_page_tlb_size),
+ encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(),
+ index_sparseness),
+ store_index_in_file_(store_index_in_file),
+ prefix_extractor_(moptions.prefix_extractor.get()) {
+ // Build index block and save it in the file if hash_table_ratio > 0
+ if (store_index_in_file_) {
+ assert(hash_table_ratio > 0 || IsTotalOrderMode());
+ index_builder_.reset(new PlainTableIndexBuilder(
+ &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness,
+ hash_table_ratio, huge_page_tlb_size_));
+ properties_.user_collected_properties
+ [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
+ }
+
+ properties_.fixed_key_len = user_key_len;
+
+  // for plain table, we put all the data in a big chunk.
+ properties_.num_data_blocks = 1;
+ // Fill it later if store_index_in_file_ == true
+ properties_.index_size = 0;
+ properties_.filter_size = 0;
+  // To support rollback to the previous version, version 0 is still used for
+  // plain encoding.
+ properties_.format_version = (encoding_type == kPlain) ? 0 : 1;
+ properties_.column_family_id = column_family_id;
+ properties_.column_family_name = column_family_name;
+ properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr
+ ? moptions_.prefix_extractor->Name()
+ : "nullptr";
+
+ std::string val;
+ PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType()));
+ properties_.user_collected_properties
+ [PlainTablePropertyNames::kEncodingType] = val;
+
+ for (auto& collector_factories : *int_tbl_prop_collector_factories) {
+ table_properties_collectors_.emplace_back(
+ collector_factories->CreateIntTblPropCollector(column_family_id));
+ }
+}
+
+PlainTableBuilder::~PlainTableBuilder() {
+}
+
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+ // temp buffer for metadata bytes between key and value.
+ char meta_bytes_buf[6];
+ size_t meta_bytes_buf_size = 0;
+
+ ParsedInternalKey internal_key;
+ if (!ParseInternalKey(key, &internal_key)) {
+ assert(false);
+ return;
+ }
+ if (internal_key.type == kTypeRangeDeletion) {
+ status_ = Status::NotSupported("Range deletion unsupported");
+ return;
+ }
+
+ // Store key hash
+ if (store_index_in_file_) {
+ if (moptions_.prefix_extractor == nullptr) {
+ keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
+ } else {
+ Slice prefix =
+ moptions_.prefix_extractor->Transform(internal_key.user_key);
+ keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
+ }
+ }
+
+ // Write value
+ assert(offset_ <= std::numeric_limits<uint32_t>::max());
+ auto prev_offset = static_cast<uint32_t>(offset_);
+ // Write out the key
+ encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
+ &meta_bytes_buf_size);
+ if (SaveIndexInFile()) {
+ index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
+ }
+
+ // Write value length
+ uint32_t value_size = static_cast<uint32_t>(value.size());
+ char* end_ptr =
+ EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size);
+ assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf));
+ meta_bytes_buf_size = end_ptr - meta_bytes_buf;
+ file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size));
+
+ // Write value
+ file_->Append(value);
+ offset_ += value_size + meta_bytes_buf_size;
+
+ properties_.num_entries++;
+ properties_.raw_key_size += key.size();
+ properties_.raw_value_size += value.size();
+ if (internal_key.type == kTypeDeletion ||
+ internal_key.type == kTypeSingleDeletion) {
+ properties_.num_deletions++;
+ } else if (internal_key.type == kTypeMerge) {
+ properties_.num_merge_operands++;
+ }
+
+ // notify property collectors
+ NotifyCollectTableCollectorsOnAdd(
+ key, value, offset_, table_properties_collectors_, ioptions_.info_log);
+}
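+
+// For reference (illustrative summary implied by the code above, not an
+// authoritative format spec): each row appended by Add() is laid out roughly
+// as
+//   [encoded key][meta bytes incl. varint32 value_size][value]
+// where the meta bytes depend on the key encoding type.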
+
+Status PlainTableBuilder::status() const { return status_; }
+
+Status PlainTableBuilder::Finish() {
+ assert(!closed_);
+ closed_ = true;
+
+ properties_.data_size = offset_;
+
+ // Write the following blocks
+ // 1. [meta block: bloom] - optional
+ // 2. [meta block: index] - optional
+ // 3. [meta block: properties]
+ // 4. [metaindex block]
+ // 5. [footer]
+
+  MetaIndexBuilder meta_index_builder;
+
+ if (store_index_in_file_ && (properties_.num_entries > 0)) {
+ assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
+ Status s;
+ BlockHandle bloom_block_handle;
+ if (bloom_bits_per_key_ > 0) {
+ bloom_block_.SetTotalBits(
+ &arena_,
+ static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
+ ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log);
+
+ PutVarint32(&properties_.user_collected_properties
+ [PlainTablePropertyNames::kNumBloomBlocks],
+ bloom_block_.GetNumBlocks());
+
+ bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
+
+ Slice bloom_finish_result = bloom_block_.Finish();
+
+ properties_.filter_size = bloom_finish_result.size();
+ s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle);
+
+ if (!s.ok()) {
+ return s;
+ }
+      meta_index_builder.Add(BloomBlockBuilder::kBloomBlock,
+                             bloom_block_handle);
+ }
+ BlockHandle index_block_handle;
+ Slice index_finish_result = index_builder_->Finish();
+
+ properties_.index_size = index_finish_result.size();
+ s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+    meta_index_builder.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
+ index_block_handle);
+ }
+
+  // Build the property block (bloom and index block sizes were set above)
+ PropertyBlockBuilder property_block_builder;
+ // -- Add basic properties
+ property_block_builder.AddTableProperty(properties_);
+
+ property_block_builder.Add(properties_.user_collected_properties);
+
+ // -- Add user collected properties
+ NotifyCollectTableCollectorsOnFinish(table_properties_collectors_,
+ ioptions_.info_log,
+ &property_block_builder);
+
+ // -- Write property block
+ BlockHandle property_block_handle;
+ auto s = WriteBlock(
+ property_block_builder.Finish(),
+ file_,
+ &offset_,
+ &property_block_handle
+ );
+ if (!s.ok()) {
+ return s;
+ }
+  meta_index_builder.Add(kPropertiesBlock, property_block_handle);
+
+ // -- write metaindex block
+ BlockHandle metaindex_block_handle;
+ s = WriteBlock(
+      meta_index_builder.Finish(),
+ file_,
+ &offset_,
+ &metaindex_block_handle
+ );
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Write Footer
+ // no need to write out new footer if we're using default checksum
+ Footer footer(kLegacyPlainTableMagicNumber, 0);
+ footer.set_metaindex_handle(metaindex_block_handle);
+ footer.set_index_handle(BlockHandle::NullBlockHandle());
+ std::string footer_encoding;
+ footer.EncodeTo(&footer_encoding);
+ s = file_->Append(footer_encoding);
+ if (s.ok()) {
+ offset_ += footer_encoding.size();
+ }
+
+ if (file_ != nullptr) {
+ file_checksum_ = file_->GetFileChecksum();
+ }
+ return s;
+}
+
+void PlainTableBuilder::Abandon() {
+ closed_ = true;
+}
+
+uint64_t PlainTableBuilder::NumEntries() const {
+ return properties_.num_entries;
+}
+
+uint64_t PlainTableBuilder::FileSize() const {
+ return offset_;
+}
+
+const char* PlainTableBuilder::GetFileChecksumFuncName() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksumFuncName();
+ } else {
+ return kUnknownFileChecksumFuncName.c_str();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h
new file mode 100644
index 000000000..fe2bf3cf9
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "db/version_edit.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_index.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+// The builder class of PlainTable. For a description of the PlainTable
+// format, see the comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableBuilder: public TableBuilder {
+ public:
+ // Create a builder that will store the contents of the table it is
+ // building in *file. Does not close the file. It is up to the
+ // caller to close the file after calling Finish(). The output file
+  // will be part of the level specified by 'level'. A value of -1 means
+  // that the caller does not know which level the output file will reside in.
+ PlainTableBuilder(
+ const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories,
+ uint32_t column_family_id, WritableFileWriter* file,
+ uint32_t user_key_size, EncodingType encoding_type,
+ size_t index_sparseness, uint32_t bloom_bits_per_key,
+ const std::string& column_family_name, uint32_t num_probes = 6,
+ size_t huge_page_tlb_size = 0, double hash_table_ratio = 0,
+ bool store_index_in_file = false);
+ // No copying allowed
+ PlainTableBuilder(const PlainTableBuilder&) = delete;
+ void operator=(const PlainTableBuilder&) = delete;
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~PlainTableBuilder();
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override;
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override;
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish() override;
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon() override;
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const override;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const override;
+
+ TableProperties GetTableProperties() const override { return properties_; }
+
+ bool SaveIndexInFile() const { return store_index_in_file_; }
+
+ // Get file checksum
+ const std::string& GetFileChecksum() const override { return file_checksum_; }
+
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override;
+
+ private:
+ Arena arena_;
+ const ImmutableCFOptions& ioptions_;
+ const MutableCFOptions& moptions_;
+ std::vector<std::unique_ptr<IntTblPropCollector>>
+ table_properties_collectors_;
+
+ BloomBlockBuilder bloom_block_;
+ std::unique_ptr<PlainTableIndexBuilder> index_builder_;
+
+ WritableFileWriter* file_;
+ uint64_t offset_ = 0;
+ uint32_t bloom_bits_per_key_;
+ size_t huge_page_tlb_size_;
+ Status status_;
+ TableProperties properties_;
+ PlainTableKeyEncoder encoder_;
+
+ bool store_index_in_file_;
+
+ std::vector<uint32_t> keys_or_prefixes_hashes_;
+ bool closed_ = false; // Either Finish() or Abandon() has been called.
+
+ const SliceTransform* prefix_extractor_;
+
+ // Store file checksum. If checksum is disabled, its value is "0".
+ std::string file_checksum_ = kUnknownFileChecksum;
+
+ Slice GetPrefix(const Slice& target) const {
+ assert(target.size() >= 8); // target is internal key
+ return GetPrefixFromUserKey(GetUserKey(target));
+ }
+
+ Slice GetPrefix(const ParsedInternalKey& target) const {
+ return GetPrefixFromUserKey(target.user_key);
+ }
+
+ Slice GetUserKey(const Slice& key) const {
+ return Slice(key.data(), key.size() - 8);
+ }
+
+ Slice GetPrefixFromUserKey(const Slice& user_key) const {
+ if (!IsTotalOrderMode()) {
+ return prefix_extractor_->Transform(user_key);
+ } else {
+ // Use empty slice as prefix if prefix_extractor is not set.
+ // In that case,
+ // it falls back to pure binary search and
+ // total iterator seek is supported.
+ return Slice();
+ }
+ }
+
+ bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc
new file mode 100644
index 000000000..c2db8f395
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.cc
@@ -0,0 +1,235 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_factory.h"
+
+#include <stdint.h>
+#include <memory>
+#include "db/dbformat.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "table/plain/plain_table_builder.h"
+#include "table/plain/plain_table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status PlainTableFactory::NewTableReader(
+ const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ return PlainTableReader::Open(
+ table_reader_options.ioptions, table_reader_options.env_options,
+ table_reader_options.internal_comparator, std::move(file), file_size,
+ table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio,
+ table_options_.index_sparseness, table_options_.huge_page_tlb_size,
+ table_options_.full_scan_mode, table_reader_options.immortal,
+ table_reader_options.prefix_extractor);
+}
+
+TableBuilder* PlainTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
+ WritableFileWriter* file) const {
+  // Ignore the skip_filters flag. The PlainTable format is optimized for
+  // small in-memory dbs, so the skip_filters optimization is not useful for
+  // plain tables.
+ return new PlainTableBuilder(
+ table_builder_options.ioptions, table_builder_options.moptions,
+ table_builder_options.int_tbl_prop_collector_factories, column_family_id,
+ file, table_options_.user_key_len, table_options_.encoding_type,
+ table_options_.index_sparseness, table_options_.bloom_bits_per_key,
+ table_builder_options.column_family_name, 6,
+ table_options_.huge_page_tlb_size, table_options_.hash_table_ratio,
+ table_options_.store_index_in_file);
+}
+
+std::string PlainTableFactory::GetPrintableTableOptions() const {
+ std::string ret;
+ ret.reserve(20000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " user_key_len: %u\n",
+ table_options_.user_key_len);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n",
+ table_options_.bloom_bits_per_key);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n",
+ table_options_.hash_table_ratio);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n",
+ table_options_.index_sparseness);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n",
+ table_options_.huge_page_tlb_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " encoding_type: %d\n",
+ table_options_.encoding_type);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " full_scan_mode: %d\n",
+ table_options_.full_scan_mode);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " store_index_in_file: %d\n",
+ table_options_.store_index_in_file);
+ ret.append(buffer);
+ return ret;
+}
+
+const PlainTableOptions& PlainTableFactory::table_options() const {
+ return table_options_;
+}
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+ const std::string& opts_str,
+ PlainTableOptions* new_table_options) {
+ std::unordered_map<std::string, std::string> opts_map;
+ Status s = StringToMap(opts_str, &opts_map);
+ if (!s.ok()) {
+ return s;
+ }
+ return GetPlainTableOptionsFromMap(table_options, opts_map,
+ new_table_options);
+}
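+
+// Usage example (illustrative): the option string contains "name=value"
+// pairs, e.g.
+//
+//   PlainTableOptions base, parsed;
+//   Status s = GetPlainTableOptionsFromString(
+//       base, "user_key_len=16;bloom_bits_per_key=10", &parsed);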
+
+Status GetMemTableRepFactoryFromString(
+ const std::string& opts_str,
+ std::unique_ptr<MemTableRepFactory>* new_mem_factory) {
+ std::vector<std::string> opts_list = StringSplit(opts_str, ':');
+ size_t len = opts_list.size();
+
+ if (opts_list.empty() || opts_list.size() > 2) {
+ return Status::InvalidArgument("Can't parse memtable_factory option ",
+ opts_str);
+ }
+
+ MemTableRepFactory* mem_factory = nullptr;
+
+ if (opts_list[0] == "skip_list") {
+ // Expecting format
+ // skip_list:<lookahead>
+ if (2 == len) {
+ size_t lookahead = ParseSizeT(opts_list[1]);
+ mem_factory = new SkipListFactory(lookahead);
+ } else if (1 == len) {
+ mem_factory = new SkipListFactory();
+ }
+ } else if (opts_list[0] == "prefix_hash") {
+ // Expecting format
+    // prefix_hash:<hash_bucket_count>
+ if (2 == len) {
+ size_t hash_bucket_count = ParseSizeT(opts_list[1]);
+ mem_factory = NewHashSkipListRepFactory(hash_bucket_count);
+ } else if (1 == len) {
+ mem_factory = NewHashSkipListRepFactory();
+ }
+ } else if (opts_list[0] == "hash_linkedlist") {
+ // Expecting format
+ // hash_linkedlist:<hash_bucket_count>
+ if (2 == len) {
+ size_t hash_bucket_count = ParseSizeT(opts_list[1]);
+ mem_factory = NewHashLinkListRepFactory(hash_bucket_count);
+ } else if (1 == len) {
+ mem_factory = NewHashLinkListRepFactory();
+ }
+ } else if (opts_list[0] == "vector") {
+ // Expecting format
+ // vector:<count>
+ if (2 == len) {
+ size_t count = ParseSizeT(opts_list[1]);
+ mem_factory = new VectorRepFactory(count);
+ } else if (1 == len) {
+ mem_factory = new VectorRepFactory();
+ }
+ } else if (opts_list[0] == "cuckoo") {
+ return Status::NotSupported(
+ "cuckoo hash memtable is not supported anymore.");
+ } else {
+ return Status::InvalidArgument("Unrecognized memtable_factory option ",
+ opts_str);
+ }
+
+ if (mem_factory != nullptr) {
+ new_mem_factory->reset(mem_factory);
+ }
+
+ return Status::OK();
+}
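+
+// Usage example (illustrative; the accepted formats are listed above):
+//
+//   std::unique_ptr<MemTableRepFactory> factory;
+//   Status s = GetMemTableRepFactoryFromString("prefix_hash:1024", &factory);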
+
+std::string ParsePlainTableOptions(const std::string& name,
+ const std::string& org_value,
+ PlainTableOptions* new_options,
+ bool input_strings_escaped = false,
+ bool ignore_unknown_options = false) {
+ const std::string& value =
+ input_strings_escaped ? UnescapeOptionString(org_value) : org_value;
+ const auto iter = plain_table_type_info.find(name);
+ if (iter == plain_table_type_info.end()) {
+ if (ignore_unknown_options) {
+ return "";
+ } else {
+ return "Unrecognized option";
+ }
+ }
+ const auto& opt_info = iter->second;
+ if (opt_info.verification != OptionVerificationType::kDeprecated &&
+ !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset,
+ opt_info.type, value)) {
+ return "Invalid value";
+ }
+ return "";
+}
+
+Status GetPlainTableOptionsFromMap(
+ const PlainTableOptions& table_options,
+ const std::unordered_map<std::string, std::string>& opts_map,
+ PlainTableOptions* new_table_options, bool input_strings_escaped,
+ bool /*ignore_unknown_options*/) {
+ assert(new_table_options);
+ *new_table_options = table_options;
+ for (const auto& o : opts_map) {
+ auto error_message = ParsePlainTableOptions(
+ o.first, o.second, new_table_options, input_strings_escaped);
+ if (error_message != "") {
+ const auto iter = plain_table_type_info.find(o.first);
+ if (iter == plain_table_type_info.end() ||
+ !input_strings_escaped || // !input_strings_escaped indicates
+ // the old API, where everything is
+ // parsable.
+ (iter->second.verification != OptionVerificationType::kByName &&
+ iter->second.verification !=
+ OptionVerificationType::kByNameAllowNull &&
+ iter->second.verification !=
+ OptionVerificationType::kByNameAllowFromNull &&
+ iter->second.verification != OptionVerificationType::kDeprecated)) {
+ // Restore "new_options" to the default "base_options".
+ *new_table_options = table_options;
+ return Status::InvalidArgument("Can't parse PlainTableOptions:",
+ o.first + " " + error_message);
+ }
+ }
+ }
+ return Status::OK();
+}
+
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
+ return new PlainTableFactory(options);
+}
+
+const std::string PlainTablePropertyNames::kEncodingType =
+ "rocksdb.plain.table.encoding.type";
+
+const std::string PlainTablePropertyNames::kBloomVersion =
+ "rocksdb.plain.table.bloom.version";
+
+const std::string PlainTablePropertyNames::kNumBloomBlocks =
+ "rocksdb.plain.table.bloom.numblocks";
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h
new file mode 100644
index 000000000..64dd171cb
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.h
@@ -0,0 +1,223 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <string>
+#include <stdint.h>
+
+#include "options/options_helper.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// PlainTableFactory is the entrance function to the PlainTable format of
+// SST files. It returns instances PlainTableBuilder as the builder
+// class and PlainTableReader as the reader class, where the format is
+// actually implemented.
+//
+// PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
+// Data is not organized in blocks, which allows fast access. Because of the
+// following downsides
+// 1. Data compression is not supported.
+// 2. Data is not checksummed.
+// it is not recommended to use this format on other types of file systems.
+//
+// PlainTable requires fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version | user_key_length |
+// +------------++------------+-----------------+ <= key1 offset
+// | encoded key1 | value_size | |
+// +------------+-------------+-------------+ |
+// | value1 |
+// | |
+// +--------------------------+-------------+---+ <= key2 offset
+// | encoded key2 | value_size | |
+// +------------+-------------+-------------+ |
+// | value2 |
+// | |
+// | ...... |
+// +-----------------+--------------------------+
+//
+// When the key encoding type is kPlain, the key part is encoded as:
+// +------------+--------------------+
+// | [key_size] | internal key |
+// +------------+--------------------+
+// for the case where user_key_len == kPlainTableVariableLength,
+// and simply:
+// +----------------------+
+// | internal key |
+// +----------------------+
+// for the case where user_key_len != kPlainTableVariableLength.
+//
+// When the key encoding type is kPrefix, keys are encoded in this format.
+// There are three ways to encode a key:
+// (1) Full Key
+// +---------------+---------------+-------------------+
+// | Full Key Flag | Full Key Size | Full Internal Key |
+// +---------------+---------------+-------------------+
+// which simply encodes a full key
+//
+// (2) A key that shares the same prefix as the previous key, where the
+//     previous key was encoded in format (1).
+// +-------------+-------------+-------------+-------------+------------+
+// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
+// +-------------+-------------+-------------+-------------+------------+
+// where Key Suffix is the suffix part of the key, including the internal
+// bytes. The actual key is constructed by concatenating the prefix part of
+// the previous key with the suffix part given here, using the sizes given.
+//
+// (3) A key that shares the same prefix as the previous key, where the
+//     previous key was encoded in format (2).
+// +-----------------+-----------------+------------------------+
+// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
+// +-----------------+-----------------+------------------------+
+// The key is constructed by concatenating the previous key's prefix (which
+// is also the prefix of the last key encoded in format (1)) with the suffix
+// given here.
+//
+// For example, the following keys (prefix and suffix separated by spaces):
+// 0000 0001
+// 0000 00021
+// 0000 0002
+// 00011 00
+// 0002 0001
+// will be encoded like this:
+// FK 8 00000001
+// PF 4 SF 5 00021
+// SF 4 0002
+// FK 7 0001100
+// FK 8 00020001
+// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
+//
+// All those "key flag + key size" shown above are in this format:
+// The 8 bits of the first byte:
+// +----+----+----+----+----+----+----+----+
+// | Type | Size |
+// +----+----+----+----+----+----+----+----+
+// Type indicates: full key, prefix, or suffix.
+// The last 6 bits are for size. If the size bits are not all 1, they encode
+// the size of the key directly. Otherwise, a varint32 is read after this
+// byte, and that varint value + 0x3F (the value of all 1s) is the key size.
+//
+// For example, full key with length 16 will be encoded as (binary):
+// 00 010000
+// (00 means full key)
+// and a prefix with 100 bytes will be encoded as:
+// 01 111111 00100101
+// (63) (37)
+// (01 means prefix)
+//
+// All the internal keys above (including kPlain and kPrefix) are encoded in
+// this format:
+// There are two types:
+// (1) normal internal key format
+// +----------- ...... -------------+----+---+---+---+---+---+---+---+
+// | user key |type| sequence ID |
+// +----------- ..... --------------+----+---+---+---+---+---+---+---+
+// (2) Special case for keys whose sequence ID is 0 and whose type is the
+//     value type
+// +----------- ...... -------------+----+
+// | user key |0x80|
+// +----------- ..... --------------+----+
+// This saves 7 bytes for the special case where the sequence ID is 0.
+//
+//
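+// A hedged decoding sketch of the "key flag + key size" control byte
+// described above (mirrors the documented layout; not the actual decoder in
+// this change):
+//
+//   unsigned char control = first_byte;   // first byte of the encoded entry
+//   unsigned char type = control >> 6;    // 00 full key, 01 prefix, 10 suffix
+//   uint32_t size = control & 0x3F;       // 0x3F means a varint32 follows
+//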
+class PlainTableFactory : public TableFactory {
+ public:
+ ~PlainTableFactory() {}
+  // user_key_len is the length of the user key. If it is set to
+  // kPlainTableVariableLength, it means variable length. Otherwise, all
+  // the keys need to have the fixed length of this value. bloom_bits_per_key
+  // is the number of bits used for the bloom filter per key. hash_table_ratio
+  // is the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skipping the hash table and relying on binary
+  // search only.
+  // index_sparseness determines the index interval for keys
+  // inside the same prefix. It is the maximum number of linear searches
+  // required after the hash and binary search.
+ // index_sparseness = 0 means index for every key.
+ // huge_page_tlb_size determines whether to allocate hash indexes from huge
+ // page TLB and the page size if allocating from there. See comments of
+ // Arena::AllocateAligned() for details.
+ explicit PlainTableFactory(
+ const PlainTableOptions& _table_options = PlainTableOptions())
+ : table_options_(_table_options) {}
+
+ const char* Name() const override { return "PlainTable"; }
+ Status NewTableReader(const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, std::unique_ptr<TableReader>* table,
+ bool prefetch_index_and_filter_in_cache) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ uint32_t column_family_id, WritableFileWriter* file) const override;
+
+ std::string GetPrintableTableOptions() const override;
+
+ const PlainTableOptions& table_options() const;
+
+ static const char kValueTypeSeqId0 = char(~0);
+
+ // Sanitizes the specified DB Options.
+ Status SanitizeOptions(
+ const DBOptions& /*db_opts*/,
+ const ColumnFamilyOptions& /*cf_opts*/) const override {
+ return Status::OK();
+ }
+
+ void* GetOptions() override { return &table_options_; }
+
+ Status GetOptionString(std::string* /*opt_string*/,
+ const std::string& /*delimiter*/) const override {
+ return Status::OK();
+ }
+
+ private:
+ PlainTableOptions table_options_;
+};
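+
+// Typical usage sketch (illustrative; follows the public RocksDB API for
+// plain tables, and the field values here are arbitrary):
+//
+//   PlainTableOptions plain_opts;
+//   plain_opts.user_key_len = 16;
+//   plain_opts.bloom_bits_per_key = 10;
+//   Options options;
+//   options.table_factory.reset(NewPlainTableFactory(plain_opts));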
+
+static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
+ {"user_key_len",
+ {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
+ OptionVerificationType::kNormal, false, 0}},
+ {"bloom_bits_per_key",
+ {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
+ OptionVerificationType::kNormal, false, 0}},
+ {"hash_table_ratio",
+ {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
+ OptionVerificationType::kNormal, false, 0}},
+ {"index_sparseness",
+ {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
+ OptionVerificationType::kNormal, false, 0}},
+ {"huge_page_tlb_size",
+ {offsetof(struct PlainTableOptions, huge_page_tlb_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
+ {"encoding_type",
+ {offsetof(struct PlainTableOptions, encoding_type),
+ OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}},
+ {"full_scan_mode",
+ {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
+ OptionVerificationType::kNormal, false, 0}},
+ {"store_index_in_file",
+ {offsetof(struct PlainTableOptions, store_index_in_file),
+ OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc
new file mode 100644
index 000000000..1099dfa6e
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "table/plain/plain_table_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+ assert(num_buckets > 0);
+ return hash % num_buckets;
+}
+}  // namespace
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+ if (!GetVarint32(&data, &index_size_)) {
+ return Status::Corruption("Couldn't read the index size!");
+ }
+ assert(index_size_ > 0);
+ if (!GetVarint32(&data, &num_prefixes_)) {
+    return Status::Corruption("Couldn't read the number of prefixes!");
+ }
+ sub_index_size_ =
+ static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+ char* index_data_begin = const_cast<char*>(data.data());
+ index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+ sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+ return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+ uint32_t prefix_hash, uint32_t* bucket_value) const {
+ int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+ GetUnaligned(index_ + bucket, bucket_value);
+ if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+ *bucket_value ^= kSubIndexMask;
+ return kSubindex;
+ }
+ if (*bucket_value >= kMaxFileSize) {
+ return kNoPrefixForBucket;
+ } else {
+ // point directly to the file
+ return kDirectToFile;
+ }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+ uint32_t offset) {
+ if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+ current_group_ = AllocateNewGroup();
+ num_records_in_current_group_ = 0;
+ }
+ auto& new_record = current_group_[num_records_in_current_group_++];
+ new_record.hash = hash;
+ new_record.offset = offset;
+ new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+ uint32_t key_offset) {
+ if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+ ++num_prefixes_;
+ if (!is_first_record_) {
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ }
+ num_keys_per_prefix_ = 0;
+ prev_key_prefix_ = key_prefix_slice.ToString();
+ prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+ due_index_ = true;
+ }
+
+ if (due_index_) {
+    // Add an index record once every index_sparseness_ keys within a prefix
+ record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+ due_index_ = false;
+ }
+
+ num_keys_per_prefix_++;
+ if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+ due_index_ = true;
+ }
+ is_first_record_ = false;
+}
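+
+// Sampling example (illustrative): with index_sparseness_ == 16, the first
+// key of each new prefix gets an index record, after which a record is added
+// roughly once every 16 keys sharing that prefix.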
+
+Slice PlainTableIndexBuilder::Finish() {
+ AllocateIndex();
+ std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+ std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+ BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
+ keys_per_prefix_hist_.ToString().c_str());
+
+ // From the temp data structure, populate indexes.
+ return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+ if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+ // Fall back to pure binary search if the user fails to specify a prefix
+ // extractor.
+ index_size_ = 1;
+ } else {
+ double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+ index_size_ =
+ static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+ assert(index_size_ > 0);
+ }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+ std::vector<IndexRecord*>* hash_to_offsets,
+ std::vector<uint32_t>* entries_per_bucket) {
+ bool first = true;
+ uint32_t prev_hash = 0;
+ size_t num_records = record_list_.GetNumRecords();
+ for (size_t i = 0; i < num_records; i++) {
+ IndexRecord* index_record = record_list_.At(i);
+ uint32_t cur_hash = index_record->hash;
+ if (first || prev_hash != cur_hash) {
+ prev_hash = cur_hash;
+ first = false;
+ }
+ uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+ IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+ index_record->next = prev_bucket_head;
+ (*hash_to_offsets)[bucket] = index_record;
+ (*entries_per_bucket)[bucket]++;
+ }
+
+ sub_index_size_ = 0;
+ for (auto entry_count : *entries_per_bucket) {
+ if (entry_count <= 1) {
+ continue;
+ }
+    // Only buckets with more than one entry have a sub-index.
+ sub_index_size_ += VarintLength(entry_count);
+ // total bytes needed to store these entries' in-file offsets.
+ sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+ }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+ const std::vector<IndexRecord*>& hash_to_offsets,
+ const std::vector<uint32_t>& entries_per_bucket) {
+ ROCKS_LOG_DEBUG(ioptions_.info_log,
+ "Reserving %" PRIu32 " bytes for plain table's sub_index",
+ sub_index_size_);
+ auto total_allocate_size = GetTotalSize();
+ char* allocated = arena_->AllocateAligned(
+ total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
+
+ auto temp_ptr = EncodeVarint32(allocated, index_size_);
+ uint32_t* index =
+ reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+ char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+ uint32_t sub_index_offset = 0;
+ for (uint32_t i = 0; i < index_size_; i++) {
+ uint32_t num_keys_for_bucket = entries_per_bucket[i];
+ switch (num_keys_for_bucket) {
+ case 0:
+ // No key for bucket
+ PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+ break;
+ case 1:
+ // point directly to the file offset
+ PutUnaligned(index + i, hash_to_offsets[i]->offset);
+ break;
+ default:
+ // point to second level indexes.
+ PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
+ char* prev_ptr = &sub_index[sub_index_offset];
+ char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+ sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+ char* sub_index_pos = &sub_index[sub_index_offset];
+ IndexRecord* record = hash_to_offsets[i];
+ int j;
+ for (j = num_keys_for_bucket - 1; j >= 0 && record;
+ j--, record = record->next) {
+ EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+ }
+ assert(j == -1 && record == nullptr);
+ sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+ assert(sub_index_offset <= sub_index_size_);
+ break;
+ }
+ }
+ assert(sub_index_offset == sub_index_size_);
+
+ ROCKS_LOG_DEBUG(ioptions_.info_log,
+ "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+ index_size_, sub_index_size_);
+ return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+ "PlainTableIndexBlock";
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h
new file mode 100644
index 000000000..86385b906
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.h
@@ -0,0 +1,249 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains two classes, PlainTableIndex and PlainTableIndexBuilder,
+// which implement the index format of PlainTable.
+// For a description of the PlainTable format, see the comments of class
+// PlainTableFactory.
+//
+//
+// PlainTableIndex contains index_size_ buckets, each of which is a
+// 32-bit integer. The lower 31 bits contain an offset value (explained below)
+// and the highest bit of the integer indicates the type of the offset.
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+//   hashing this prefix), whose first row starts at this offset of the
+//   file.
+// 1 indicates that the bucket contains more than one prefix, or there
+//   are too many rows for one prefix so we need a binary search for it. In
+//   this case, the offset indicates the offset into sub_index_ holding the
+//   binary search indexes of keys for those rows. Those binary search indexes
+//   are organized in this way:
+//
+// A leading varint32 indicates how many indexes (N) are stored after it.
+// After that, there are N 32-bit integers, each an offset into the file
+// pointing to the start of a row. Those offsets are guaranteed to be in
+// ascending order, so the keys they point to are also in ascending order,
+// which allows us to use them for binary searches. Below is a visual
+// presentation of a bucket.
+//
+// <begin>
+// number_of_records: varint32
+// record 1 file offset: fixedint32
+// record 2 file offset: fixedint32
+// ....
+// record N file offset: fixedint32
+// <end>
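+//
+// Interpreting a bucket value (illustrative sketch mirroring GetOffset()
+// in the .cc file; not additional API):
+//
+//   uint32_t v = bucket_value;
+//   if (v & 0x80000000u) {              // kSubIndexMask set -> sub-index
+//     uint32_t sub_index_offset = v ^ 0x80000000u;
+//   } else if (v < ((1u << 31) - 1)) {  // less than kMaxFileSize
+//     uint32_t file_offset = v;         // points directly into the file
+//   }  // otherwise the bucket holds no prefix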
+
+// The class loads the index block from a PlainTable SST file, and executes
+// the index lookup.
+// The class is used by PlainTableReader class.
+class PlainTableIndex {
+ public:
+ enum IndexSearchResult {
+ kNoPrefixForBucket = 0,
+ kDirectToFile = 1,
+ kSubindex = 2
+ };
+
+ explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+ PlainTableIndex()
+ : index_size_(0),
+ sub_index_size_(0),
+ num_prefixes_(0),
+ index_(nullptr),
+ sub_index_(nullptr) {}
+
+  // Executes a lookup in the hash table.
+ // The hash key is `prefix_hash`. The function fills the hash bucket
+ // content in `bucket_value`, which is up to the caller to interpret.
+ IndexSearchResult GetOffset(uint32_t prefix_hash,
+ uint32_t* bucket_value) const;
+
+ // Initialize data from `index_data`, which points to raw data for
+ // index stored in the SST file.
+ Status InitFromRawData(Slice index_data);
+
+ // Decode the sub index for specific hash bucket.
+ // The `offset` is the value returned as `bucket_value` by GetOffset()
+ // and is only valid when the return value is `kSubindex`.
+ // The return value is the pointer to the starting address of the
+ // sub-index. `upper_bound` is filled with the value indicating how many
+ // entries the sub-index has.
+ const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+ uint32_t* upper_bound) const {
+ const char* index_ptr = &sub_index_[offset];
+ return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+ }
+
+ uint32_t GetIndexSize() const { return index_size_; }
+
+ uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+ uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+ static const uint64_t kMaxFileSize = (1u << 31) - 1;
+ static const uint32_t kSubIndexMask = 0x80000000;
+ static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+ uint32_t index_size_;
+ uint32_t sub_index_size_;
+ uint32_t num_prefixes_;
+
+ uint32_t* index_;
+ char* sub_index_;
+};
+
+// PlainTableIndexBuilder is used to create a plain table index.
+// After calling Finish(), it returns a Slice, which is usually
+// used either to initialize PlainTableIndex or
+// to save the index to an SST file.
+// For more details about the index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+// The class is used by PlainTableBuilder class.
+class PlainTableIndexBuilder {
+ public:
+ PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
+ const SliceTransform* prefix_extractor,
+ size_t index_sparseness, double hash_table_ratio,
+ size_t huge_page_tlb_size)
+ : arena_(arena),
+ ioptions_(ioptions),
+ record_list_(kRecordsPerGroup),
+ is_first_record_(true),
+ due_index_(false),
+ num_prefixes_(0),
+ num_keys_per_prefix_(0),
+ prev_key_prefix_hash_(0),
+ index_sparseness_(index_sparseness),
+ index_size_(0),
+ sub_index_size_(0),
+ prefix_extractor_(prefix_extractor),
+ hash_table_ratio_(hash_table_ratio),
+ huge_page_tlb_size_(huge_page_tlb_size) {}
+
+ void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+ Slice Finish();
+
+ uint32_t GetTotalSize() const {
+ return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+ PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+ }
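+
+  // Worked size example (illustrative): with index_size_ = 5,
+  // num_prefixes_ = 3 and sub_index_size_ = 20, GetTotalSize() returns
+  // 1 (varint 5) + 1 (varint 3) + 4 * 5 + 20 = 42 bytes.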
+
+ static const std::string kPlainTableIndexBlock;
+
+ private:
+ struct IndexRecord {
+ uint32_t hash; // hash of the prefix
+ uint32_t offset; // offset of a row
+ IndexRecord* next;
+ };
+
+ // Helper class to track all the index records
+ class IndexRecordList {
+ public:
+ explicit IndexRecordList(size_t num_records_per_group)
+ : kNumRecordsPerGroup(num_records_per_group),
+ current_group_(nullptr),
+ num_records_in_current_group_(num_records_per_group) {}
+
+ ~IndexRecordList() {
+ for (size_t i = 0; i < groups_.size(); i++) {
+ delete[] groups_[i];
+ }
+ }
+
+ void AddRecord(uint32_t hash, uint32_t offset);
+
+ size_t GetNumRecords() const {
+ return (groups_.size() - 1) * kNumRecordsPerGroup +
+ num_records_in_current_group_;
+ }
+ IndexRecord* At(size_t index) {
+ return &(groups_[index / kNumRecordsPerGroup]
+ [index % kNumRecordsPerGroup]);
+ }
+
+ private:
+ IndexRecord* AllocateNewGroup() {
+ IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+ groups_.push_back(result);
+ return result;
+ }
+
+    // Each group in `groups_` contains a fixed number of records (determined
+    // by kNumRecordsPerGroup), which helps minimize the cost when the list
+    // needs to grow.
+ const size_t kNumRecordsPerGroup;
+ IndexRecord* current_group_;
+ // List of arrays allocated
+ std::vector<IndexRecord*> groups_;
+ size_t num_records_in_current_group_;
+ };
+
+ void AllocateIndex();
+
+  // Internal helper function to bucket the index record list into hash
+  // buckets.
+ void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+ std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper function to fill the indexes and bloom filters into
+  // internal data structures.
+ Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+ const std::vector<uint32_t>& entries_per_bucket);
+
+ Arena* arena_;
+ const ImmutableCFOptions ioptions_;
+ HistogramImpl keys_per_prefix_hist_;
+ IndexRecordList record_list_;
+ bool is_first_record_;
+ bool due_index_;
+ uint32_t num_prefixes_;
+ uint32_t num_keys_per_prefix_;
+
+ uint32_t prev_key_prefix_hash_;
+ size_t index_sparseness_;
+ uint32_t index_size_;
+ uint32_t sub_index_size_;
+
+ const SliceTransform* prefix_extractor_;
+ double hash_table_ratio_;
+ size_t huge_page_tlb_size_;
+
+ std::string prev_key_prefix_;
+
+ static const size_t kRecordsPerGroup = 256;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc
new file mode 100644
index 000000000..d82b969ba
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.cc
@@ -0,0 +1,498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_key_coding.h"
+
+#include <algorithm>
+#include <string>
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum PlainTableEntryType : unsigned char {
+ kFullKey = 0,
+ kPrefixFromPreviousKey = 1,
+ kKeySuffix = 2,
+};
+
+namespace {
+
+// Control byte:
+// The first two bits indicate the type of the entry.
+// The other six bits hold the inlined size. If all six bits are 1 (0x3F),
+// overflow bytes are used: key_size - 0x3F is encoded as a varint32 after
+// this byte.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+ char* out_buffer) {
+ out_buffer[0] = type << 6;
+
+ if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+ // size inlined
+ out_buffer[0] |= static_cast<char>(key_size);
+ return 1;
+ } else {
+ out_buffer[0] |= kSizeInlineLimit;
+ char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+ return ptr - out_buffer;
+ }
+}
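+
+// Illustrative encodings (not from the original source), assuming the
+// control-byte layout above:
+//   EncodeSize(kKeySuffix, 5, buf) -> buf[0] = (2 << 6) | 5 = 0x85; 1 byte.
+//   EncodeSize(kFullKey, 100, buf) -> buf[0] = (0 << 6) | 0x3F = 0x3F, then
+//                                     varint32(100 - 63 = 37); 2 bytes total.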
+} // namespace
+
+// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+ PlainTableEntryType* entry_type,
+ uint32_t* key_size,
+ uint32_t* bytes_read) {
+ Slice next_byte_slice;
+ bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *entry_type = static_cast<PlainTableEntryType>(
+ (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+ 6);
+ char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
+ if (inline_key_size < kSizeInlineLimit) {
+ *key_size = inline_key_size;
+ *bytes_read = 1;
+ return Status::OK();
+ } else {
+ uint32_t extra_size;
+ uint32_t tmp_bytes_read;
+ success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+ &tmp_bytes_read);
+ if (!success) {
+ return file_reader_.status();
+ }
+ assert(tmp_bytes_read > 0);
+ *key_size = kSizeInlineLimit + extra_size;
+ *bytes_read = tmp_bytes_read + 1;
+ return Status::OK();
+ }
+}
+
+Status PlainTableKeyEncoder::AppendKey(const Slice& key,
+ WritableFileWriter* file,
+ uint64_t* offset, char* meta_bytes_buf,
+ size_t* meta_bytes_buf_size) {
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(key, &parsed_key)) {
+ return Status::Corruption(Slice());
+ }
+
+ Slice key_to_write = key; // Portion of internal key to write out.
+
+ uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+ if (encoding_type_ == kPlain) {
+ if (fixed_user_key_len_ == kPlainTableVariableLength) {
+ // Write key length
+ char key_size_buf[5]; // tmp buffer for key size as varint32
+ char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+ assert(ptr <= key_size_buf + sizeof(key_size_buf));
+ auto len = ptr - key_size_buf;
+ Status s = file->Append(Slice(key_size_buf, len));
+ if (!s.ok()) {
+ return s;
+ }
+ *offset += len;
+ }
+ } else {
+ assert(encoding_type_ == kPrefix);
+ char size_bytes[12];
+ size_t size_bytes_pos = 0;
+
+ Slice prefix =
+ prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+ if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() ||
+ key_count_for_prefix_ % index_sparseness_ == 0) {
+ key_count_for_prefix_ = 1;
+ pre_prefix_.SetUserKey(prefix);
+ size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+ Status s = file->Append(Slice(size_bytes, size_bytes_pos));
+ if (!s.ok()) {
+ return s;
+ }
+ *offset += size_bytes_pos;
+ } else {
+ key_count_for_prefix_++;
+ if (key_count_for_prefix_ == 2) {
+ // For second key within a prefix, need to encode prefix length
+ size_bytes_pos +=
+ EncodeSize(kPrefixFromPreviousKey,
+ static_cast<uint32_t>(pre_prefix_.GetUserKey().size()),
+ size_bytes + size_bytes_pos);
+ }
+ uint32_t prefix_len =
+ static_cast<uint32_t>(pre_prefix_.GetUserKey().size());
+ size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+ size_bytes + size_bytes_pos);
+ Status s = file->Append(Slice(size_bytes, size_bytes_pos));
+ if (!s.ok()) {
+ return s;
+ }
+ *offset += size_bytes_pos;
+ key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+ }
+ }
+
+  // Encode the full key.
+  // meta_bytes_buf holds the value size as a varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to save one file append call; the flag takes 1 byte.
+ if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+ Status s =
+ file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+ if (!s.ok()) {
+ return s;
+ }
+ *offset += key_to_write.size() - 8;
+ meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+ *meta_bytes_buf_size += 1;
+ } else {
+    Status s = file->Append(key_to_write);
+    if (!s.ok()) {
+      return s;
+    }
+    *offset += key_to_write.size();
+ }
+
+ return Status::OK();
+}
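+
+// Illustrative layout under kPrefix encoding (a sketch, not from the
+// original source): for three keys whose user keys are "app1", "app2" and
+// "app3", sharing the prefix "app", with index_sparseness > 3, the key
+// sections written are:
+//   key 1: [kFullKey, size 4]["app1"][8-byte footer, unless seqId 0]
+//   key 2: [kPrefixFromPreviousKey, size 3][kKeySuffix, size 1]["2"][...]
+//   key 3: [kKeySuffix, size 1]["3"][...]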
+
+Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
+ uint32_t len) {
+ assert(file_offset + len <= file_info_->data_end_offset);
+ return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
+ len);
+}
+
+bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
+ Slice* out) {
+ const uint32_t kPrefetchSize = 256u;
+
+ // Try to read from buffers.
+ for (uint32_t i = 0; i < num_buf_; i++) {
+ Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
+ if (file_offset >= buffer->buf_start_offset &&
+ file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
+ *out = GetFromBuffer(buffer, file_offset, len);
+ return true;
+ }
+ }
+
+ Buffer* new_buffer;
+  // The data needed is not in any of the buffers. Allocate a new buffer.
+ if (num_buf_ < buffers_.size()) {
+ // Add a new buffer
+ new_buffer = new Buffer();
+ buffers_[num_buf_++].reset(new_buffer);
+ } else {
+ // Now simply replace the last buffer. Can improve the placement policy
+ // if needed.
+ new_buffer = buffers_[num_buf_ - 1].get();
+ }
+
+ assert(file_offset + len <= file_info_->data_end_offset);
+ uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+ std::max(kPrefetchSize, len));
+ if (size_to_read > new_buffer->buf_capacity) {
+ new_buffer->buf.reset(new char[size_to_read]);
+ new_buffer->buf_capacity = size_to_read;
+ new_buffer->buf_len = 0;
+ }
+ Slice read_result;
+ Status s = file_info_->file->Read(file_offset, size_to_read, &read_result,
+ new_buffer->buf.get());
+ if (!s.ok()) {
+ status_ = s;
+ return false;
+ }
+ new_buffer->buf_start_offset = file_offset;
+ new_buffer->buf_len = size_to_read;
+ *out = GetFromBuffer(new_buffer, file_offset, len);
+ return true;
+}
+
+inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
+ uint32_t* bytes_read) {
+ if (file_info_->is_mmap_mode) {
+ const char* start = file_info_->file_data.data() + offset;
+ const char* limit =
+ file_info_->file_data.data() + file_info_->data_end_offset;
+ const char* key_ptr = GetVarint32Ptr(start, limit, out);
+ assert(key_ptr != nullptr);
+ *bytes_read = static_cast<uint32_t>(key_ptr - start);
+ return true;
+ } else {
+ return ReadVarint32NonMmap(offset, out, bytes_read);
+ }
+}
+
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
+ uint32_t* bytes_read) {
+ const char* start;
+ const char* limit;
+ const uint32_t kMaxVarInt32Size = 6u;
+ uint32_t bytes_to_read =
+ std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+ Slice bytes;
+ if (!Read(offset, bytes_to_read, &bytes)) {
+ return false;
+ }
+ start = bytes.data();
+ limit = bytes.data() + bytes.size();
+
+ const char* key_ptr = GetVarint32Ptr(start, limit, out);
+ *bytes_read =
+ (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+ return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+ uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+ uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+ Slice tmp_slice;
+ bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+ if (!success) {
+ return file_reader_.status();
+ }
+ if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
+ // Special encoding for the row with seqID=0
+ parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
+ parsed_key->sequence = 0;
+ parsed_key->type = kTypeValue;
+ *bytes_read += user_key_size + 1;
+ *internal_key_valid = false;
+ } else {
+ success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *internal_key_valid = true;
+ if (!ParseInternalKey(*internal_key, parsed_key)) {
+ return Status::Corruption(
+ Slice("Incorrect value type found when reading the next key"));
+ }
+ *bytes_read += user_key_size + 8;
+ }
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key,
+ uint32_t* bytes_read,
+ bool* /*seekable*/) {
+ uint32_t user_key_size = 0;
+ Status s;
+ if (fixed_user_key_len_ != kPlainTableVariableLength) {
+ user_key_size = fixed_user_key_len_;
+ } else {
+ uint32_t tmp_size = 0;
+ uint32_t tmp_read;
+ bool success =
+ file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+ if (!success) {
+ return file_reader_.status();
+ }
+ assert(tmp_read > 0);
+ user_key_size = tmp_size;
+ *bytes_read = tmp_read;
+ }
+  // Dummy initial value to silence a compiler warning.
+ bool decoded_internal_key_valid = true;
+ Slice decoded_internal_key;
+ s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &decoded_internal_key);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode) {
+ cur_key_.SetInternalKey(*parsed_key);
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), user_key_size);
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ } else if (internal_key != nullptr) {
+ if (decoded_internal_key_valid) {
+ *internal_key = decoded_internal_key;
+ } else {
+ // Need to copy out the internal key
+ cur_key_.SetInternalKey(*parsed_key);
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ }
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+ uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+ uint32_t* bytes_read, bool* seekable) {
+ PlainTableEntryType entry_type;
+
+ bool expect_suffix = false;
+ Status s;
+ do {
+ uint32_t size = 0;
+    // Dummy initial value to silence a compiler warning.
+ bool decoded_internal_key_valid = true;
+ uint32_t my_bytes_read = 0;
+ s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+ &my_bytes_read);
+ if (!s.ok()) {
+ return s;
+ }
+ if (my_bytes_read == 0) {
+ return Status::Corruption("Unexpected EOF when reading size of the key");
+ }
+ *bytes_read += my_bytes_read;
+
+ switch (entry_type) {
+ case kFullKey: {
+ expect_suffix = false;
+ Slice decoded_internal_key;
+ s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &decoded_internal_key);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode ||
+ (internal_key != nullptr && !decoded_internal_key_valid)) {
+          // In non-mmap mode, we always need to make a copy of keys returned
+          // to users, because after reading the value for the key, the key
+          // might become invalid.
+ cur_key_.SetInternalKey(*parsed_key);
+ saved_user_key_ = cur_key_.GetUserKey();
+ if (!file_reader_.file_info()->is_mmap_mode) {
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), size);
+ }
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ } else {
+ if (internal_key != nullptr) {
+ *internal_key = decoded_internal_key;
+ }
+ saved_user_key_ = parsed_key->user_key;
+ }
+ break;
+ }
+ case kPrefixFromPreviousKey: {
+ if (seekable != nullptr) {
+ *seekable = false;
+ }
+ prefix_len_ = size;
+ assert(prefix_extractor_ == nullptr ||
+ prefix_extractor_->Transform(saved_user_key_).size() ==
+ prefix_len_);
+        // Need to read another size flag for the suffix.
+ expect_suffix = true;
+ break;
+ }
+ case kKeySuffix: {
+ expect_suffix = false;
+ if (seekable != nullptr) {
+ *seekable = false;
+ }
+
+ Slice tmp_slice;
+ s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+ bytes_read, &decoded_internal_key_valid,
+ &tmp_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!file_reader_.file_info()->is_mmap_mode) {
+          // In non-mmap mode, we need to make a copy of keys returned to
+          // users, because after reading the value for the key, the key
+          // might become invalid.
+          // saved_user_key_ points to cur_key_. We make a copy of the
+          // prefix part into another string, and reconstruct the current
+          // key from the prefix part and the suffix part back into cur_key_.
+ std::string tmp =
+ Slice(saved_user_key_.data(), prefix_len_).ToString();
+ cur_key_.Reserve(prefix_len_ + size);
+ cur_key_.SetInternalKey(tmp, *parsed_key);
+ parsed_key->user_key =
+ Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size);
+ saved_user_key_ = cur_key_.GetUserKey();
+ } else {
+ cur_key_.Reserve(prefix_len_ + size);
+ cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+ *parsed_key);
+ }
+ parsed_key->user_key = cur_key_.GetUserKey();
+ if (internal_key != nullptr) {
+ *internal_key = cur_key_.GetInternalKey();
+ }
+ break;
+ }
+ default:
+        return Status::Corruption("Unidentified size flag.");
+ }
+ } while (expect_suffix); // Another round if suffix is expected.
+ return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value,
+ uint32_t* bytes_read, bool* seekable) {
+ assert(value != nullptr);
+ Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+ seekable);
+ if (s.ok()) {
+ assert(bytes_read != nullptr);
+ uint32_t value_size;
+ uint32_t value_size_bytes;
+ bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+ &value_size, &value_size_bytes);
+ if (!success) {
+ return file_reader_.status();
+ }
+ if (value_size_bytes == 0) {
+ return Status::Corruption(
+ "Unexpected EOF when reading the next value's size.");
+ }
+ *bytes_read += value_size_bytes;
+ success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+ if (!success) {
+ return file_reader_.status();
+ }
+ *bytes_read += value_size;
+ }
+ return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key,
+ uint32_t* bytes_read,
+ bool* seekable) {
+ *bytes_read = 0;
+ if (seekable != nullptr) {
+ *seekable = true;
+ }
+ Status s;
+ if (encoding_type_ == kPlain) {
+ return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
+ bytes_read, seekable);
+ } else {
+ assert(encoding_type_ == kPrefix);
+ return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
+ bytes_read, seekable);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h
new file mode 100644
index 000000000..d1460837d
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <array>
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// The file contains three helper classes of PlainTable format,
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes implement the lowest-level operations of PlainTable.
+// Actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper class for PlainTable format to write out a key to an output file
+// The class is used in PlainTableBuilder.
+class PlainTableKeyEncoder {
+ public:
+ explicit PlainTableKeyEncoder(EncodingType encoding_type,
+ uint32_t user_key_len,
+ const SliceTransform* prefix_extractor,
+ size_t index_sparseness)
+ : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+ fixed_user_key_len_(user_key_len),
+ prefix_extractor_(prefix_extractor),
+ index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+ key_count_for_prefix_(0) {}
+ // key: the key to write out, in the format of internal key.
+ // file: the output file to write out
+ // offset: offset in the file. Needs to be updated after appending bytes
+ // for the key
+ // meta_bytes_buf: buffer for extra meta bytes
+ // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+ // if meta_bytes_buf is updated.
+ Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset,
+ char* meta_bytes_buf, size_t* meta_bytes_buf_size);
+
+ // Return actual encoding type to be picked
+ EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+ EncodingType encoding_type_;
+ uint32_t fixed_user_key_len_;
+ const SliceTransform* prefix_extractor_;
+ const size_t index_sparseness_;
+ size_t key_count_for_prefix_;
+ IterKey pre_prefix_;
+};
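+
+// Illustrative usage (a sketch, not from the original source; file_writer,
+// internal_key and prefix_extractor are assumed to exist):
+//
+//   PlainTableKeyEncoder encoder(kPrefix, kPlainTableVariableLength,
+//                                prefix_extractor, 16 /* index_sparseness */);
+//   uint64_t offset = 0;
+//   char meta_bytes_buf[6];
+//   size_t meta_bytes_buf_size = 0;
+//   Status s = encoder.AppendKey(internal_key, file_writer, &offset,
+//                                meta_bytes_buf, &meta_bytes_buf_size);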
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether it is an mmap read or a non-mmap read.
+// The class is implemented in a way that favors the performance of the mmap
+// case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+ explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+ : file_info_(_file_info), num_buf_(0) {}
+  // In mmap mode, the results point to the mmapped area of the file, which
+  // means they remain valid until the file is closed.
+ // In non-mmap mode, the results point to an internal buffer. If the caller
+ // makes another read call, the results may not be valid. So callers should
+ // make a copy when needed.
+ // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // covers these two common use cases:
+  // (1) the hash index identifies only one location; we read the key to
+  //     verify the location, and read the key and value if it is the right
+  //     location.
+  // (2) after checking the hash index, we identify two locations (because
+  //     of hash bucket conflicts); we binary search the two locations to
+  //     see which one is the one we need and start reading from there.
+  // These two most common use cases are covered by the two buffers,
+  // so that we don't need to re-read the same location.
+  // Currently we keep fixed-size buffers. If a read doesn't exactly fit
+  // a buffer, we replace the second buffer with the location the user reads.
+ //
+ // If return false, status code is stored in status_.
+ bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+ if (file_info_->is_mmap_mode) {
+ assert(file_offset + len <= file_info_->data_end_offset);
+ *out = Slice(file_info_->file_data.data() + file_offset, len);
+ return true;
+ } else {
+ return ReadNonMmap(file_offset, len, out);
+ }
+ }
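+
+  // Illustrative buffer behavior (a sketch, not from the original source):
+  // after Read(0, 16, ...) and Read(4096, 16, ...), buffers_ holds the
+  // prefetched ranges [0, 256) and [4096, 4352); a later Read(8192, 16, ...)
+  // replaces the second buffer, keeping the first read cached.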
+
+ // If return false, status code is stored in status_.
+ bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means EOF. Returning false means failure, and the status
+  // is saved in status_. Status is not returned directly, to avoid copying
+  // the status object and to match the performance of mmap mode.
+ inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+ uint32_t* bytes_read);
+
+ bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+ uint32_t* bytes_read);
+
+ Status status() const { return status_; }
+
+ const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+ const PlainTableReaderFileInfo* file_info_;
+
+ struct Buffer {
+ Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+ std::unique_ptr<char[]> buf;
+ uint32_t buf_start_offset;
+ uint32_t buf_len;
+ uint32_t buf_capacity;
+ };
+
+ // Keep buffers for two recent reads.
+ std::array<std::unique_ptr<Buffer>, 2> buffers_;
+ uint32_t num_buf_;
+ Status status_;
+
+ Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys from an input buffer.
+// The class is used by PlainTableReader and PlainTableIterator.
+class PlainTableKeyDecoder {
+ public:
+ explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+ EncodingType encoding_type,
+ uint32_t user_key_len,
+ const SliceTransform* prefix_extractor)
+ : file_reader_(file_info),
+ encoding_type_(encoding_type),
+ prefix_len_(0),
+ fixed_user_key_len_(user_key_len),
+ prefix_extractor_(prefix_extractor),
+ in_prefix_(false) {}
+  // Find the next key.
+  // start_offset: file offset where the key starts.
+  // parsed_key: the output of the result key.
+  // internal_key: if not null, filled with the result key in
+  //               un-parsed format.
+  // bytes_read: output; how many bytes were read starting from start_offset.
+  // seekable: output; whether the key can be read from this offset. Used
+  //           when building indexes.
+ Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+
+ Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+
+ PlainTableFileReader file_reader_;
+ EncodingType encoding_type_;
+ uint32_t prefix_len_;
+ uint32_t fixed_user_key_len_;
+ Slice saved_user_key_;
+ IterKey cur_key_;
+ const SliceTransform* prefix_extractor_;
+ bool in_prefix_;
+
+ private:
+ Status NextPlainEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+ Status NextPrefixEncodingKey(uint32_t start_offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, uint32_t* bytes_read,
+ bool* seekable = nullptr);
+ Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+ ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+ bool* internal_key_valid, Slice* internal_key);
+ inline Status DecodeSize(uint32_t start_offset,
+ PlainTableEntryType* entry_type, uint32_t* key_size,
+ uint32_t* bytes_read);
+};
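+
+// Illustrative decoding loop (a sketch, not from the original source;
+// file_info, prefix_extractor and the offsets are assumed to exist):
+//
+//   PlainTableKeyDecoder decoder(file_info, kPrefix,
+//                                kPlainTableVariableLength, prefix_extractor);
+//   uint32_t pos = data_start_offset;
+//   while (pos < data_end_offset) {
+//     ParsedInternalKey parsed;
+//     Slice value;
+//     uint32_t bytes_read;
+//     Status s = decoder.NextKey(pos, &parsed, nullptr, &value, &bytes_read);
+//     if (!s.ok()) break;
+//     pos += bytes_read;
+//   }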
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc
new file mode 100644
index 000000000..55756d9c1
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.cc
@@ -0,0 +1,775 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "table/plain/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/two_level_iterator.h"
+
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Safely get a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as a fixed 32-bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+ return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+} // namespace
+
+// Iterator to iterate over a PlainTable
+class PlainTableIterator : public InternalIterator {
+ public:
+ explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+ // No copying allowed
+ PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const PlainTableIterator&) = delete;
+
+ ~PlainTableIterator() override;
+
+ bool Valid() const override;
+
+ void SeekToFirst() override;
+
+ void SeekToLast() override;
+
+ void Seek(const Slice& target) override;
+
+ void SeekForPrev(const Slice& target) override;
+
+ void Next() override;
+
+ void Prev() override;
+
+ Slice key() const override;
+
+ Slice value() const override;
+
+ Status status() const override;
+
+ private:
+ PlainTableReader* table_;
+ PlainTableKeyDecoder decoder_;
+ bool use_prefix_seek_;
+ uint32_t offset_;
+ uint32_t next_offset_;
+ Slice key_;
+ Slice value_;
+ Status status_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+PlainTableReader::PlainTableReader(
+ const ImmutableCFOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size,
+ const TableProperties* table_properties,
+ const SliceTransform* prefix_extractor)
+ : internal_comparator_(icomparator),
+ encoding_type_(encoding_type),
+ full_scan_mode_(false),
+ user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+ prefix_extractor_(prefix_extractor),
+ enable_bloom_(false),
+ bloom_(6),
+ file_info_(std::move(file), storage_options,
+ static_cast<uint32_t>(table_properties->data_size)),
+ ioptions_(ioptions),
+ file_size_(file_size),
+ table_properties_(nullptr) {}
+
+PlainTableReader::~PlainTableReader() {
+}
+
+Status PlainTableReader::Open(
+ const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+ bool full_scan_mode, const bool immortal_table,
+ const SliceTransform* prefix_extractor) {
+ if (file_size > PlainTableIndex::kMaxFileSize) {
+ return Status::NotSupported("File is too large for PlainTableReader!");
+ }
+
+ TableProperties* props_ptr = nullptr;
+ auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ ioptions, &props_ptr,
+ true /* compression_type_missing */);
+ std::shared_ptr<TableProperties> props(props_ptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(hash_table_ratio >= 0.0);
+ auto& user_props = props->user_collected_properties;
+ auto prefix_extractor_in_file = props->prefix_extractor_name;
+
+ if (!full_scan_mode &&
+ !prefix_extractor_in_file.empty() /* old version sst file*/
+ && prefix_extractor_in_file != "nullptr") {
+ if (!prefix_extractor) {
+ return Status::InvalidArgument(
+ "Prefix extractor is missing when opening a PlainTable built "
+ "using a prefix extractor");
+ } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
+ 0) {
+ return Status::InvalidArgument(
+ "Prefix extractor given doesn't match the one used to build "
+ "PlainTable");
+ }
+ }
+
+ EncodingType encoding_type = kPlain;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ if (encoding_type_prop != user_props.end()) {
+ encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+ }
+
+ std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+ ioptions, std::move(file), env_options, internal_comparator,
+ encoding_type, file_size, props.get(), prefix_extractor));
+
+ s = new_reader->MmapDataIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!full_scan_mode) {
+ s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
+ hash_table_ratio, index_sparseness,
+ huge_page_tlb_size);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ // Flag to indicate it is a full scan mode so that none of the indexes
+ // can be used.
+ new_reader->full_scan_mode_ = true;
+ }
+ // PopulateIndex can add to the props, so don't store them until now
+ new_reader->table_properties_ = props;
+
+ if (immortal_table && new_reader->file_info_.is_mmap_mode) {
+ new_reader->dummy_cleanable_.reset(new Cleanable());
+ }
+
+ *table_reader = std::move(new_reader);
+ return s;
+}
+
+void PlainTableReader::SetupForCompaction() {
+}
+
+InternalIterator* PlainTableReader::NewIterator(
+ const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
+ Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+ size_t /*compaction_readahead_size*/) {
+ // Not necessarily used here, but make sure this has been initialized
+ assert(table_properties_);
+
+ // Auto prefix mode is not implemented in PlainTable.
+ bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+ !options.auto_prefix_mode;
+ if (arena == nullptr) {
+ return new PlainTableIterator(this, use_prefix_seek);
+ } else {
+ auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+ return new (mem) PlainTableIterator(this, use_prefix_seek);
+ }
+}
+
+Status PlainTableReader::PopulateIndexRecordList(
+ PlainTableIndexBuilder* index_builder,
+ std::vector<uint32_t>* prefix_hashes) {
+ Slice prev_key_prefix_slice;
+ std::string prev_key_prefix_buf;
+ uint32_t pos = data_start_offset_;
+
+ bool is_first_record = true;
+ Slice key_prefix_slice;
+ PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+ prefix_extractor_);
+ while (pos < file_info_.data_end_offset) {
+ uint32_t key_offset = pos;
+ ParsedInternalKey key;
+ Slice value_slice;
+ bool seekable = false;
+ Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
+ if (!s.ok()) {
+ return s;
+ }
+
+ key_prefix_slice = GetPrefix(key);
+ if (enable_bloom_) {
+ bloom_.AddHash(GetSliceHash(key.user_key));
+ } else {
+ if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+ if (!is_first_record) {
+ prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+ }
+ if (file_info_.is_mmap_mode) {
+ prev_key_prefix_slice = key_prefix_slice;
+ } else {
+ prev_key_prefix_buf = key_prefix_slice.ToString();
+ prev_key_prefix_slice = prev_key_prefix_buf;
+ }
+ }
+ }
+
+ index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+ if (!seekable && is_first_record) {
+ return Status::Corruption("Key for a prefix is not seekable");
+ }
+
+ is_first_record = false;
+ }
+
+ prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+ auto s = index_.InitFromRawData(index_builder->Finish());
+ return s;
+}
+
+void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
+ size_t huge_page_tlb_size) {
+ uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
+ if (bloom_total_bits > 0) {
+ enable_bloom_ = true;
+ bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+ huge_page_tlb_size, ioptions_.info_log);
+ }
+}
+
+void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
+ assert(bloom_.IsInitialized());
+ for (const auto prefix_hash : prefix_hashes) {
+ bloom_.AddHash(prefix_hash);
+ }
+}
+
+Status PlainTableReader::MmapDataIfNeeded() {
+ if (file_info_.is_mmap_mode) {
+ // Get mmapped memory.
+    return file_info_.file->Read(0, static_cast<size_t>(file_size_),
+                                 &file_info_.file_data, nullptr);
+ }
+ return Status::OK();
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+ int bloom_bits_per_key,
+ double hash_table_ratio,
+ size_t index_sparseness,
+ size_t huge_page_tlb_size) {
+ assert(props != nullptr);
+
+ BlockContents index_block_contents;
+ Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+ file_size_, kPlainTableMagicNumber, ioptions_,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ BlockType::kIndex, &index_block_contents,
+ true /* compression_type_missing */);
+
+ bool index_in_file = s.ok();
+
+ BlockContents bloom_block_contents;
+ bool bloom_in_file = false;
+ // We only need to read the bloom block if index block is in file.
+ if (index_in_file) {
+ s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+ file_size_, kPlainTableMagicNumber, ioptions_,
+ BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
+ &bloom_block_contents,
+ true /* compression_type_missing */);
+ bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
+ }
+
+ Slice* bloom_block;
+ if (bloom_in_file) {
+ // If bloom_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the bloom block.
+ // It needs to be kept alive to keep `bloom_block` valid.
+ bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
+ bloom_block = &bloom_block_contents.data;
+ } else {
+ bloom_block = nullptr;
+ }
+
+ Slice* index_block;
+ if (index_in_file) {
+ // If index_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the index block.
+ // It needs to be kept alive to keep `index_block` valid.
+ index_block_alloc_ = std::move(index_block_contents.allocation);
+ index_block = &index_block_contents.data;
+ } else {
+ index_block = nullptr;
+ }
+
+ if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
+    // A prefix extractor is required for a hash-based look-up.
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor to enable prefix hash mode.");
+ }
+
+  // First, read the whole file. For every kIndexIntervalForSamePrefixKeys
+  // rows of a prefix (starting from the first one), generate a record of
+  // (hash, offset) and append it to IndexRecordList, a data structure
+  // created to store them.
+
+ if (!index_in_file) {
+ // Allocate bloom filter here for total order mode.
+ if (IsTotalOrderMode()) {
+ AllocateBloom(bloom_bits_per_key,
+ static_cast<uint32_t>(props->num_entries),
+ huge_page_tlb_size);
+ }
+ } else if (bloom_in_file) {
+ enable_bloom_ = true;
+ auto num_blocks_property = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+
+ uint32_t num_blocks = 0;
+ if (num_blocks_property != props->user_collected_properties.end()) {
+ Slice temp_slice(num_blocks_property->second);
+ if (!GetVarint32(&temp_slice, &num_blocks)) {
+ num_blocks = 0;
+ }
+ }
+ // cast away const qualifier, because bloom_ won't be changed
+ bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
+ static_cast<uint32_t>(bloom_block->size()) * 8,
+ num_blocks);
+ } else {
+ // Index in file but no bloom in file. Disable bloom filter in this case.
+ enable_bloom_ = false;
+ bloom_bits_per_key = 0;
+ }
+
+ PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
+ index_sparseness, hash_table_ratio,
+ huge_page_tlb_size);
+
+ std::vector<uint32_t> prefix_hashes;
+ if (!index_in_file) {
+    // Populates bloom_ if enabled (total order mode).
+ s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ s = index_.InitFromRawData(*index_block);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!index_in_file) {
+ if (!IsTotalOrderMode()) {
+ // Calculated bloom filter size and allocate memory for
+ // bloom filter based on the number of prefixes, then fill it.
+ AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+ huge_page_tlb_size);
+ if (enable_bloom_) {
+ FillBloom(prefix_hashes);
+ }
+ }
+ }
+
+ // Fill two table properties.
+ if (!index_in_file) {
+ props->user_collected_properties["plain_table_hash_table_size"] =
+ ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+ props->user_collected_properties["plain_table_sub_index_size"] =
+ ToString(index_.GetSubIndexSize());
+ } else {
+ props->user_collected_properties["plain_table_hash_table_size"] =
+ ToString(0);
+ props->user_collected_properties["plain_table_sub_index_size"] =
+ ToString(0);
+ }
+
+ return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
+ const Slice& target, const Slice& prefix,
+ uint32_t prefix_hash, bool& prefix_matched,
+ uint32_t* offset) const {
+ prefix_matched = false;
+ uint32_t prefix_index_offset;
+ auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+ if (res == PlainTableIndex::kNoPrefixForBucket) {
+ *offset = file_info_.data_end_offset;
+ return Status::OK();
+ } else if (res == PlainTableIndex::kDirectToFile) {
+ *offset = prefix_index_offset;
+ return Status::OK();
+ }
+
+ // point to sub-index, need to do a binary search
+ uint32_t upper_bound;
+ const char* base_ptr =
+ index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
+ uint32_t low = 0;
+ uint32_t high = upper_bound;
+ ParsedInternalKey mid_key;
+ ParsedInternalKey parsed_target;
+ if (!ParseInternalKey(target, &parsed_target)) {
+ return Status::Corruption(Slice());
+ }
+
+  // The key is in the range [low, high). Do a binary search in it.
+ while (high - low > 1) {
+ uint32_t mid = (high + low) / 2;
+ uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+ uint32_t tmp;
+ Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
+ if (!s.ok()) {
+ return s;
+ }
+ int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+ if (cmp_result < 0) {
+ low = mid;
+ } else {
+ if (cmp_result == 0) {
+        // We happen to have found the exact key, or target is smaller than
+        // the first key after base_offset.
+ prefix_matched = true;
+ *offset = file_offset;
+ return Status::OK();
+ } else {
+ high = mid;
+ }
+ }
+ }
+  // Either the key at position low or the key at position low+1 could share
+  // the same prefix as target. We need to rule out one of them to avoid
+  // going to the wrong prefix.
+ ParsedInternalKey low_key;
+ uint32_t tmp;
+ uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+ Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (GetPrefix(low_key) == prefix) {
+ prefix_matched = true;
+ *offset = low_key_offset;
+ } else if (low + 1 < upper_bound) {
+    // There is possibly a next prefix; return it.
+ prefix_matched = false;
+ *offset = GetFixed32Element(base_ptr, low + 1);
+ } else {
+ // target is larger than a key of the last prefix in this bucket
+ // but with a different prefix. Key does not exist.
+ *offset = file_info_.data_end_offset;
+ }
+ return Status::OK();
+}
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+ if (!enable_bloom_) {
+ return true;
+ }
+
+ if (bloom_.MayContainHash(hash)) {
+ PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ return true;
+ } else {
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ return false;
+ }
+}
+
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+ ParsedInternalKey* parsed_key,
+ Slice* internal_key, Slice* value,
+ bool* seekable) const {
+ if (*offset == file_info_.data_end_offset) {
+ *offset = file_info_.data_end_offset;
+ return Status::OK();
+ }
+
+ if (*offset > file_info_.data_end_offset) {
+ return Status::Corruption("Offset is out of file size");
+ }
+
+ uint32_t bytes_read;
+ Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+ &bytes_read, seekable);
+ if (!s.ok()) {
+ return s;
+ }
+ *offset = *offset + bytes_read;
+ return Status::OK();
+}
+
+void PlainTableReader::Prepare(const Slice& target) {
+ if (enable_bloom_) {
+ uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+ bloom_.Prefetch(prefix_hash);
+ }
+}
+
+Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
+ GetContext* get_context,
+ const SliceTransform* /* prefix_extractor */,
+ bool /*skip_filters*/) {
+ // Check bloom filter first.
+ Slice prefix_slice;
+ uint32_t prefix_hash;
+ if (IsTotalOrderMode()) {
+ if (full_scan_mode_) {
+ status_ =
+ Status::InvalidArgument("Get() is not allowed in full scan mode.");
+ }
+ // Match whole user key for bloom filter check.
+ if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
+ return Status::OK();
+ }
+    // In total order mode, there is only one bucket (bucket 0), and we
+    // always use an empty prefix.
+ prefix_slice = Slice();
+ prefix_hash = 0;
+ } else {
+ prefix_slice = GetPrefix(target);
+ prefix_hash = GetSliceHash(prefix_slice);
+ if (!MatchBloom(prefix_hash)) {
+ return Status::OK();
+ }
+ }
+ uint32_t offset;
+ bool prefix_match;
+ PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+ prefix_extractor_);
+ Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
+ prefix_match, &offset);
+
+ if (!s.ok()) {
+ return s;
+ }
+ ParsedInternalKey found_key;
+ ParsedInternalKey parsed_target;
+ if (!ParseInternalKey(target, &parsed_target)) {
+ return Status::Corruption(Slice());
+ }
+ Slice found_value;
+ while (offset < file_info_.data_end_offset) {
+ s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!prefix_match) {
+ // Need to verify prefix for the first key found if it is not yet
+ // checked.
+ if (GetPrefix(found_key) != prefix_slice) {
+ return Status::OK();
+ }
+ prefix_match = true;
+ }
+ // TODO(ljin): since we know the key comparison result here,
+ // can we enable the fast path?
+ if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+ bool dont_care __attribute__((__unused__));
+ if (!get_context->SaveValue(found_key, found_value, &dont_care,
+ dummy_cleanable_.get())) {
+ break;
+ }
+ }
+ }
+ return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
+ TableReaderCaller /*caller*/) {
+ return 0;
+}
+
+uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
+ const Slice& /*end*/,
+ TableReaderCaller /*caller*/) {
+ return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+ bool use_prefix_seek)
+ : table_(table),
+ decoder_(&table_->file_info_, table_->encoding_type_,
+ table_->user_key_len_, table_->prefix_extractor_),
+ use_prefix_seek_(use_prefix_seek) {
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+PlainTableIterator::~PlainTableIterator() {
+}
+
+bool PlainTableIterator::Valid() const {
+ return offset_ < table_->file_info_.data_end_offset &&
+ offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+ status_ = Status::OK();
+ next_offset_ = table_->data_start_offset_;
+ if (next_offset_ >= table_->file_info_.data_end_offset) {
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+ } else {
+ Next();
+ }
+}
+
+void PlainTableIterator::SeekToLast() {
+ assert(false);
+ status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+ next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+ if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+ // This check is done here instead of NewIterator() to permit creating an
+ // iterator with total_order_seek = true even if we won't be able to Seek()
+ // it. This is needed for compaction: it creates iterator with
+ // total_order_seek = true but usually never does Seek() on it,
+ // only SeekToFirst().
+ status_ =
+ Status::InvalidArgument(
+ "total_order_seek not implemented for PlainTable.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+
+  // If the user didn't set the prefix seek option and we are not able to do
+  // a total-order Seek(), fail with an assertion.
+ if (table_->IsTotalOrderMode()) {
+ if (table_->full_scan_mode_) {
+ status_ =
+ Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ } else if (table_->GetIndexSize() > 1) {
+ assert(false);
+ status_ = Status::NotSupported(
+ "PlainTable cannot issue non-prefix seek unless in total order "
+ "mode.");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+ }
+
+ Slice prefix_slice = table_->GetPrefix(target);
+ uint32_t prefix_hash = 0;
+ // Bloom filter is ignored in total-order mode.
+ if (!table_->IsTotalOrderMode()) {
+ prefix_hash = GetSliceHash(prefix_slice);
+ if (!table_->MatchBloom(prefix_hash)) {
+ status_ = Status::OK();
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+ }
+ bool prefix_match;
+ status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
+ prefix_match, &next_offset_);
+ if (!status_.ok()) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ return;
+ }
+
+ if (next_offset_ < table_->file_info_.data_end_offset) {
+ for (Next(); status_.ok() && Valid(); Next()) {
+ if (!prefix_match) {
+ // Need to verify the first key's prefix
+ if (table_->GetPrefix(key()) != prefix_slice) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ break;
+ }
+ prefix_match = true;
+ }
+ if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+ break;
+ }
+ }
+ } else {
+ offset_ = table_->file_info_.data_end_offset;
+ }
+}
+
+void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
+ assert(false);
+ status_ =
+ Status::NotSupported("SeekForPrev() is not supported in PlainTable");
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Next() {
+ offset_ = next_offset_;
+ if (offset_ < table_->file_info_.data_end_offset) {
+ Slice tmp_slice;
+ ParsedInternalKey parsed_key;
+ status_ =
+ table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
+ if (!status_.ok()) {
+ offset_ = next_offset_ = table_->file_info_.data_end_offset;
+ }
+ }
+}
+
+void PlainTableIterator::Prev() {
+ assert(false);
+}
+
+Slice PlainTableIterator::key() const {
+ assert(Valid());
+ return key_;
+}
+
+Slice PlainTableIterator::value() const {
+ assert(Valid());
+ return value_;
+}
+
+Status PlainTableIterator::status() const {
+ return status_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h
new file mode 100644
index 000000000..db7b0626f
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.h
@@ -0,0 +1,246 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <unordered_map>
+#include <memory>
+#include <vector>
+#include <string>
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "file/random_access_file_reader.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Block;
+struct BlockContents;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
+
+extern const uint32_t kPlainTableVariableLength;
+
+struct PlainTableReaderFileInfo {
+ bool is_mmap_mode;
+ Slice file_data;
+ uint32_t data_end_offset;
+ std::unique_ptr<RandomAccessFileReader> file;
+
+ PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
+ const EnvOptions& storage_options,
+ uint32_t _data_size_offset)
+ : is_mmap_mode(storage_options.use_mmap_reads),
+ data_end_offset(_data_size_offset),
+ file(std::move(_file)) {}
+};
+
+// The reader class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableReader: public TableReader {
+ public:
+  // Based on the output file format shown in plain_table_factory.h.
+  // When opening the file, PlainTableReader creates a hash table from key
+  // prefixes to file offsets. Each bucket either points directly to the
+  // data offset of the first key with that prefix, or, if too many keys
+  // share the prefix, to a binary-searchable sub-index from key suffix to
+  // offset on disk.
+ static Status Open(const ImmutableCFOptions& ioptions,
+ const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, std::unique_ptr<TableReader>* table,
+ const int bloom_bits_per_key, double hash_table_ratio,
+ size_t index_sparseness, size_t huge_page_tlb_size,
+ bool full_scan_mode, const bool immortal_table = false,
+ const SliceTransform* prefix_extractor = nullptr);
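+
+  // Illustrative call (a sketch, not from the original source; the option
+  // values are arbitrary):
+  //
+  //   std::unique_ptr<TableReader> reader;
+  //   Status s = PlainTableReader::Open(
+  //       ioptions, env_options, internal_comparator, std::move(file),
+  //       file_size, &reader, 10 /* bloom_bits_per_key */,
+  //       0.75 /* hash_table_ratio */, 16 /* index_sparseness */,
+  //       0 /* huge_page_tlb_size */, false /* full_scan_mode */);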
+
+  // Returns a new iterator over the table contents.
+  // compaction_readahead_size: its value will only be used for compaction
+  // reads.
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
+
+ void Prepare(const Slice& target) override;
+
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ uint64_t ApproximateOffsetOf(const Slice& key,
+ TableReaderCaller caller) override;
+
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) override;
+
+ uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+ void SetupForCompaction() override;
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override {
+ return table_properties_;
+ }
+
+ virtual size_t ApproximateMemoryUsage() const override {
+ return arena_.MemoryAllocatedBytes();
+ }
+
+ PlainTableReader(const ImmutableCFOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const EnvOptions& env_options,
+ const InternalKeyComparator& internal_comparator,
+ EncodingType encoding_type, uint64_t file_size,
+ const TableProperties* table_properties,
+ const SliceTransform* prefix_extractor);
+ virtual ~PlainTableReader();
+
+ protected:
+ // Check bloom filter to see whether it might contain this prefix.
+ // The hash of the prefix is given, since it can be reused for index lookup
+ // too.
+ virtual bool MatchBloom(uint32_t hash) const;
+
+ // PopulateIndex() builds index of keys. It must be called before any query
+ // to the table.
+ //
+  // props: the table properties object that needs to be stored. Ownership
+  // of the object will be passed.
+
+ Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness,
+ size_t huge_page_tlb_size);
+
+ Status MmapDataIfNeeded();
+
+ private:
+ const InternalKeyComparator internal_comparator_;
+ EncodingType encoding_type_;
+ // represents plain table's current status.
+ Status status_;
+
+ PlainTableIndex index_;
+ bool full_scan_mode_;
+
+  // data_start_offset_ and data_end_offset_ define the range of the
+  // SST file that stores data.
+ const uint32_t data_start_offset_ = 0;
+ const uint32_t user_key_len_;
+ const SliceTransform* prefix_extractor_;
+
+ static const size_t kNumInternalBytes = 8;
+
+ // Bloom filter is used to rule out non-existent key
+ bool enable_bloom_;
+ PlainTableBloomV1 bloom_;
+ PlainTableReaderFileInfo file_info_;
+ Arena arena_;
+ CacheAllocationPtr index_block_alloc_;
+ CacheAllocationPtr bloom_block_alloc_;
+
+ const ImmutableCFOptions& ioptions_;
+ std::unique_ptr<Cleanable> dummy_cleanable_;
+ uint64_t file_size_;
+ protected: // for testing
+ std::shared_ptr<const TableProperties> table_properties_;
+ private:
+
+ bool IsFixedLength() const {
+ return user_key_len_ != kPlainTableVariableLength;
+ }
+
+ size_t GetFixedInternalKeyLength() const {
+ return user_key_len_ + kNumInternalBytes;
+ }
+
+ Slice GetPrefix(const Slice& target) const {
+ assert(target.size() >= 8); // target is internal key
+ return GetPrefixFromUserKey(GetUserKey(target));
+ }
+
+ Slice GetPrefix(const ParsedInternalKey& target) const {
+ return GetPrefixFromUserKey(target.user_key);
+ }
+
+ Slice GetUserKey(const Slice& key) const {
+ return Slice(key.data(), key.size() - 8);
+ }
+
+ Slice GetPrefixFromUserKey(const Slice& user_key) const {
+ if (!IsTotalOrderMode()) {
+ return prefix_extractor_->Transform(user_key);
+ } else {
+ // Use empty slice as prefix if prefix_extractor is not set.
+ // In that case,
+ // it falls back to pure binary search and
+ // total iterator seek is supported.
+ return Slice();
+ }
+ }
+
+ friend class TableCache;
+ friend class PlainTableIterator;
+
+ // Internal helper function to generate an IndexRecordList object from all
+ // the rows, which contains index records as a list.
+ // If bloom_ is not null, all the keys' full-key hash will be added to the
+ // bloom filter.
+ Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+ std::vector<uint32_t>* prefix_hashes);
+
+ // Internal helper function to allocate memory for bloom filter
+ void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
+ size_t huge_page_tlb_size);
+
+ void FillBloom(const std::vector<uint32_t>& prefix_hashes);
+
+  // Read the key and value at `offset` into the output parameters
+  // `parsed_key`, `internal_key`, `value` and `seekable`.
+  // On success, `offset` will be updated to the offset of the next key.
+  // `parsed_key` will hold the key in parsed format.
+  // If `internal_key` is not null, it will be filled with the key in slice
+  // format.
+  // If `seekable` is not null, it will indicate whether we can directly read
+  // data using this offset.
+ Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+ ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+ bool* seekable = nullptr) const;
+ // Get the file offset for the key `target`.
+ // The output parameter `prefix_matched` is set to true if the offset is
+ // confirmed for a key with the same prefix as `target`.
+ Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
+ const Slice& prefix, uint32_t prefix_hash,
+ bool& prefix_matched, uint32_t* offset) const;
+
+ bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
+ // No copying allowed
+ PlainTableReader(const PlainTableReader&) = delete;
+ void operator=(const PlainTableReader&) = delete;
+};
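+
+// Usage sketch (illustrative only; the wiring mirrors table_reader_bench.cc
+// further below). A PlainTableReader is created through the table factory,
+// which runs PopulateIndex() as part of opening the table; a point lookup
+// then goes through Get() with a GetContext:
+//
+//   std::unique_ptr<TableReader> reader;
+//   s = options.table_factory->NewTableReader(
+//       TableReaderOptions(ioptions, prefix_extractor, env_options, ikc),
+//       std::move(file_reader), file_size, &reader);
+//   GetContext get_context(ioptions.user_comparator, ioptions.merge_operator,
+//                          ioptions.info_log, ioptions.statistics,
+//                          GetContext::kNotFound, Slice(key), &value, nullptr,
+//                          &merge_context, true, &max_covering_tombstone_seq,
+//                          env);
+//   s = reader->Get(read_options, key, &get_context, prefix_extractor);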
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/scoped_arena_iterator.h b/src/rocksdb/table/scoped_arena_iterator.h
new file mode 100644
index 000000000..8d73d12ee
--- /dev/null
+++ b/src/rocksdb/table/scoped_arena_iterator.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "table/internal_iterator.h"
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ScopedArenaIterator {
+
+ void reset(InternalIterator* iter) ROCKSDB_NOEXCEPT {
+ if (iter_ != nullptr) {
+ iter_->~InternalIterator();
+ }
+ iter_ = iter;
+ }
+
+ public:
+
+ explicit ScopedArenaIterator(InternalIterator* iter = nullptr)
+ : iter_(iter) {}
+
+ ScopedArenaIterator(const ScopedArenaIterator&) = delete;
+ ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete;
+
+ ScopedArenaIterator(ScopedArenaIterator&& o) ROCKSDB_NOEXCEPT {
+ iter_ = o.iter_;
+ o.iter_ = nullptr;
+ }
+
+ ScopedArenaIterator& operator=(ScopedArenaIterator&& o) ROCKSDB_NOEXCEPT {
+ reset(o.iter_);
+ o.iter_ = nullptr;
+ return *this;
+ }
+
+ InternalIterator* operator->() { return iter_; }
+ InternalIterator* get() { return iter_; }
+
+ void set(InternalIterator* iter) { reset(iter); }
+
+ InternalIterator* release() {
+ assert(iter_ != nullptr);
+ auto* res = iter_;
+ iter_ = nullptr;
+ return res;
+ }
+
+ ~ScopedArenaIterator() {
+ reset(nullptr);
+ }
+
+ private:
+ InternalIterator* iter_;
+};
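+
+// Example (illustrative sketch): iterators allocated from an Arena must not
+// be deleted with `delete`; ScopedArenaIterator instead destroys them in
+// place and lets the arena reclaim the memory.
+//
+//   Arena arena;
+//   ScopedArenaIterator iter(table_reader->NewIterator(
+//       read_options, /*prefix_extractor=*/nullptr, &arena,
+//       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+//     // use iter->key() / iter->value()
+//   }
+//   // ~ScopedArenaIterator() calls iter_->~InternalIterator() rather than
+//   // delete; the memory is freed when `arena` is destroyed.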
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/sst_file_reader.cc b/src/rocksdb/table/sst_file_reader.cc
new file mode 100644
index 000000000..9e3ba6eab
--- /dev/null
+++ b/src/rocksdb/table/sst_file_reader.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_file_reader.h"
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct SstFileReader::Rep {
+ Options options;
+ EnvOptions soptions;
+ ImmutableCFOptions ioptions;
+ MutableCFOptions moptions;
+
+ std::unique_ptr<TableReader> table_reader;
+
+ Rep(const Options& opts)
+ : options(opts),
+ soptions(options),
+ ioptions(options),
+ moptions(ColumnFamilyOptions(options)) {}
+};
+
+SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {}
+
+SstFileReader::~SstFileReader() {}
+
+Status SstFileReader::Open(const std::string& file_path) {
+ auto r = rep_.get();
+ Status s;
+ uint64_t file_size = 0;
+ std::unique_ptr<RandomAccessFile> file;
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ s = r->options.env->GetFileSize(file_path, &file_size);
+ if (s.ok()) {
+ s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions);
+ }
+ if (s.ok()) {
+ file_reader.reset(new RandomAccessFileReader(
+ NewLegacyRandomAccessFileWrapper(file), file_path));
+ }
+ if (s.ok()) {
+ TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(),
+ r->soptions, r->ioptions.internal_comparator);
+ // Allow opening files with a global sequence number, for backward
+ // compatibility.
+ t_opt.largest_seqno = kMaxSequenceNumber;
+ s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
+ file_size, &r->table_reader);
+ }
+ return s;
+}
+
+Iterator* SstFileReader::NewIterator(const ReadOptions& options) {
+ auto r = rep_.get();
+ auto sequence = options.snapshot != nullptr
+ ? options.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ auto internal_iter = r->table_reader->NewIterator(
+ options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kSSTFileReader);
+ return NewDBIterator(r->options.env, options, r->ioptions, r->moptions,
+ r->ioptions.user_comparator, internal_iter, sequence,
+ r->moptions.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */);
+}
+
+std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
+ const {
+ return rep_->table_reader->GetTableProperties();
+}
+
+Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
+ return rep_->table_reader->VerifyChecksum(read_options,
+ TableReaderCaller::kSSTFileReader);
+}
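+
+// Typical usage (sketch; the path is a placeholder): open a standalone SST
+// file and scan it without a DB instance, as the tests below do.
+//
+//   SstFileReader reader(Options());
+//   Status s = reader.Open("/path/to/file.sst");
+//   if (s.ok()) {
+//     std::unique_ptr<Iterator> it(reader.NewIterator(ReadOptions()));
+//     for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//       // it->key() / it->value()
+//     }
+//   }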
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_reader_test.cc b/src/rocksdb/table/sst_file_reader_test.cc
new file mode 100644
index 000000000..ac020a3ec
--- /dev/null
+++ b/src/rocksdb/table/sst_file_reader_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "rocksdb/db.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksdb/sst_file_writer.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string EncodeAsString(uint64_t v) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%08" PRIu64, v);
+ return std::string(buf);
+}
+
+std::string EncodeAsUint64(uint64_t v) {
+ std::string dst;
+ PutFixed64(&dst, v);
+ return dst;
+}
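+
+// Note (illustrative): PutFixed64 appends the value in fixed-width
+// little-endian byte order, so EncodeAsUint64 keys only sort numerically
+// under a comparator that decodes them (see the Uint64Comparator test
+// below); under the default bytewise comparator they would not.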
+
+class SstFileReaderTest : public testing::Test {
+ public:
+ SstFileReaderTest() {
+ options_.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ sst_name_ = test::PerThreadDBPath("sst_file");
+ }
+
+ ~SstFileReaderTest() {
+ Status s = Env::Default()->DeleteFile(sst_name_);
+ assert(s.ok());
+ }
+
+ void CreateFile(const std::string& file_name,
+ const std::vector<std::string>& keys) {
+ SstFileWriter writer(soptions_, options_);
+ ASSERT_OK(writer.Open(file_name));
+ for (size_t i = 0; i + 2 < keys.size(); i += 3) {
+ ASSERT_OK(writer.Put(keys[i], keys[i]));
+ ASSERT_OK(writer.Merge(keys[i + 1], EncodeAsUint64(i + 1)));
+ ASSERT_OK(writer.Delete(keys[i + 2]));
+ }
+ ASSERT_OK(writer.Finish());
+ }
+
+ void CheckFile(const std::string& file_name,
+ const std::vector<std::string>& keys,
+ bool check_global_seqno = false) {
+ ReadOptions ropts;
+ SstFileReader reader(options_);
+ ASSERT_OK(reader.Open(file_name));
+ ASSERT_OK(reader.VerifyChecksum());
+ std::unique_ptr<Iterator> iter(reader.NewIterator(ropts));
+ iter->SeekToFirst();
+ for (size_t i = 0; i + 2 < keys.size(); i += 3) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(keys[i]), 0);
+ ASSERT_EQ(iter->value().compare(keys[i]), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(keys[i + 1]), 0);
+ ASSERT_EQ(iter->value().compare(EncodeAsUint64(i + 1)), 0);
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+ if (check_global_seqno) {
+ auto properties = reader.GetTableProperties();
+ ASSERT_TRUE(properties);
+ auto& user_properties = properties->user_collected_properties;
+ ASSERT_TRUE(
+ user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno));
+ }
+ }
+
+ void CreateFileAndCheck(const std::vector<std::string>& keys) {
+ CreateFile(sst_name_, keys);
+ CheckFile(sst_name_, keys);
+ }
+
+ protected:
+ Options options_;
+ EnvOptions soptions_;
+ std::string sst_name_;
+};
+
+const uint64_t kNumKeys = 100;
+
+TEST_F(SstFileReaderTest, Basic) {
+ std::vector<std::string> keys;
+ for (uint64_t i = 0; i < kNumKeys; i++) {
+ keys.emplace_back(EncodeAsString(i));
+ }
+ CreateFileAndCheck(keys);
+}
+
+TEST_F(SstFileReaderTest, Uint64Comparator) {
+ options_.comparator = test::Uint64Comparator();
+ std::vector<std::string> keys;
+ for (uint64_t i = 0; i < kNumKeys; i++) {
+ keys.emplace_back(EncodeAsUint64(i));
+ }
+ CreateFileAndCheck(keys);
+}
+
+TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
+ std::vector<std::string> keys;
+ for (uint64_t i = 0; i < kNumKeys; i++) {
+ keys.emplace_back(EncodeAsString(i));
+ }
+ // Generate a SST file.
+ CreateFile(sst_name_, keys);
+
+ // Ingest the file into a db, to assign it a global sequence number.
+ Options options;
+ options.create_if_missing = true;
+ std::string db_name = test::PerThreadDBPath("test_db");
+ DB* db;
+ ASSERT_OK(DB::Open(options, db_name, &db));
+ // Bump sequence number.
+ ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Ingest the file.
+ IngestExternalFileOptions ingest_options;
+ ingest_options.write_global_seqno = true;
+ ASSERT_OK(db->IngestExternalFile({sst_name_}, ingest_options));
+ std::vector<std::string> live_files;
+ uint64_t manifest_file_size = 0;
+ ASSERT_OK(db->GetLiveFiles(live_files, &manifest_file_size));
+ // Get the ingested file: pick the live .sst file with the largest name,
+ // i.e. the highest file number, which is the file just ingested.
+ std::string ingested_file;
+ for (auto& live_file : live_files) {
+ if (live_file.substr(live_file.size() - 4, std::string::npos) == ".sst") {
+ if (ingested_file.empty() || ingested_file < live_file) {
+ ingested_file = live_file;
+ }
+ }
+ }
+ ASSERT_FALSE(ingested_file.empty());
+ delete db;
+
+ // Verify the file can be open and read by SstFileReader.
+ CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */);
+
+ // Cleanup.
+ ASSERT_OK(DestroyDB(db_name, options));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_writer.cc b/src/rocksdb/table/sst_file_writer.cc
new file mode 100644
index 000000000..a5d08ea77
--- /dev/null
+++ b/src/rocksdb/table/sst_file_writer.cc
@@ -0,0 +1,319 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/sst_file_writer.h"
+
+#include <vector>
+
+#include "db/dbformat.h"
+#include "env/composite_env_wrapper.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string ExternalSstFilePropertyNames::kVersion =
+ "rocksdb.external_sst_file.version";
+const std::string ExternalSstFilePropertyNames::kGlobalSeqno =
+ "rocksdb.external_sst_file.global_seqno";
+
+#ifndef ROCKSDB_LITE
+
+const size_t kFadviseTrigger = 1024 * 1024; // 1MB
+
+struct SstFileWriter::Rep {
+ Rep(const EnvOptions& _env_options, const Options& options,
+ Env::IOPriority _io_priority, const Comparator* _user_comparator,
+ ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters)
+ : env_options(_env_options),
+ ioptions(options),
+ mutable_cf_options(options),
+ io_priority(_io_priority),
+ internal_comparator(_user_comparator),
+ cfh(_cfh),
+ invalidate_page_cache(_invalidate_page_cache),
+ last_fadvise_size(0),
+ skip_filters(_skip_filters) {}
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ std::unique_ptr<TableBuilder> builder;
+ EnvOptions env_options;
+ ImmutableCFOptions ioptions;
+ MutableCFOptions mutable_cf_options;
+ Env::IOPriority io_priority;
+ InternalKeyComparator internal_comparator;
+ ExternalSstFileInfo file_info;
+ InternalKey ikey;
+ std::string column_family_name;
+ ColumnFamilyHandle* cfh;
+ // If true, we will give the OS a hint that these file pages are not needed
+ // every time we write 1MB to the file.
+ bool invalidate_page_cache;
+ // The size of the file the last time we called Fadvise to remove cached
+ // pages from the page cache.
+ uint64_t last_fadvise_size;
+ bool skip_filters;
+ Status Add(const Slice& user_key, const Slice& value,
+ const ValueType value_type) {
+ if (!builder) {
+ return Status::InvalidArgument("File is not opened");
+ }
+
+ if (file_info.num_entries == 0) {
+ file_info.smallest_key.assign(user_key.data(), user_key.size());
+ } else {
+ if (internal_comparator.user_comparator()->Compare(
+ user_key, file_info.largest_key) <= 0) {
+ // Make sure that keys are added in order
+ return Status::InvalidArgument(
+ "Keys must be added in strict ascending order.");
+ }
+ }
+
+ // TODO(tec) : For external SST files we could omit the seqno and type.
+ switch (value_type) {
+ case ValueType::kTypeValue:
+ ikey.Set(user_key, 0 /* Sequence Number */,
+ ValueType::kTypeValue /* Put */);
+ break;
+ case ValueType::kTypeMerge:
+ ikey.Set(user_key, 0 /* Sequence Number */,
+ ValueType::kTypeMerge /* Merge */);
+ break;
+ case ValueType::kTypeDeletion:
+ ikey.Set(user_key, 0 /* Sequence Number */,
+ ValueType::kTypeDeletion /* Delete */);
+ break;
+ default:
+ return Status::InvalidArgument("Value type is not supported");
+ }
+ builder->Add(ikey.Encode(), value);
+
+ // update file info
+ file_info.num_entries++;
+ file_info.largest_key.assign(user_key.data(), user_key.size());
+ file_info.file_size = builder->FileSize();
+
+ InvalidatePageCache(false /* closing */);
+
+ return Status::OK();
+ }
+
+ Status DeleteRange(const Slice& begin_key, const Slice& end_key) {
+ if (!builder) {
+ return Status::InvalidArgument("File is not opened");
+ }
+
+ RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */);
+ if (file_info.num_range_del_entries == 0) {
+ file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+ tombstone.start_key_.size());
+ file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+ tombstone.end_key_.size());
+ } else {
+ if (internal_comparator.user_comparator()->Compare(
+ tombstone.start_key_, file_info.smallest_range_del_key) < 0) {
+ file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+ tombstone.start_key_.size());
+ }
+ if (internal_comparator.user_comparator()->Compare(
+ tombstone.end_key_, file_info.largest_range_del_key) > 0) {
+ file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+ tombstone.end_key_.size());
+ }
+ }
+
+ auto ikey_and_end_key = tombstone.Serialize();
+ builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
+
+ // update file info
+ file_info.num_range_del_entries++;
+ file_info.file_size = builder->FileSize();
+
+ InvalidatePageCache(false /* closing */);
+
+ return Status::OK();
+ }
+
+ void InvalidatePageCache(bool closing) {
+ if (!invalidate_page_cache) {
+ // Fadvise disabled
+ return;
+ }
+ uint64_t bytes_since_last_fadvise =
+ builder->FileSize() - last_fadvise_size;
+ if (bytes_since_last_fadvise > kFadviseTrigger || closing) {
+ TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache",
+ &(bytes_since_last_fadvise));
+ // Tell the OS that we don't need this file in the page cache
+ file_writer->InvalidateCache(0, 0);
+ last_fadvise_size = builder->FileSize();
+ }
+ }
+
+};
+
+SstFileWriter::SstFileWriter(const EnvOptions& env_options,
+ const Options& options,
+ const Comparator* user_comparator,
+ ColumnFamilyHandle* column_family,
+ bool invalidate_page_cache,
+ Env::IOPriority io_priority, bool skip_filters)
+ : rep_(new Rep(env_options, options, io_priority, user_comparator,
+ column_family, invalidate_page_cache, skip_filters)) {
+ rep_->file_info.file_size = 0;
+}
+
+SstFileWriter::~SstFileWriter() {
+ if (rep_->builder) {
+ // User did not call Finish() or Finish() failed, we need to
+ // abandon the builder.
+ rep_->builder->Abandon();
+ }
+}
+
+Status SstFileWriter::Open(const std::string& file_path) {
+ Rep* r = rep_.get();
+ Status s;
+ std::unique_ptr<WritableFile> sst_file;
+ s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ sst_file->SetIOPriority(r->io_priority);
+
+ CompressionType compression_type;
+ CompressionOptions compression_opts;
+ if (r->ioptions.bottommost_compression != kDisableCompressionOption) {
+ compression_type = r->ioptions.bottommost_compression;
+ if (r->ioptions.bottommost_compression_opts.enabled) {
+ compression_opts = r->ioptions.bottommost_compression_opts;
+ } else {
+ compression_opts = r->ioptions.compression_opts;
+ }
+ } else if (!r->ioptions.compression_per_level.empty()) {
+ // Use the compression of the last level if we have per level compression
+ compression_type = *(r->ioptions.compression_per_level.rbegin());
+ compression_opts = r->ioptions.compression_opts;
+ } else {
+ compression_type = r->mutable_cf_options.compression;
+ compression_opts = r->ioptions.compression_opts;
+ }
+ uint64_t sample_for_compression =
+ r->mutable_cf_options.sample_for_compression;
+
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+
+ // SstFileWriter properties collector to add SstFileWriter version.
+ int_tbl_prop_collector_factories.emplace_back(
+ new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+ 0 /* global_seqno*/));
+
+ // User collector factories
+ auto user_collector_factories =
+ r->ioptions.table_properties_collector_factories;
+ for (size_t i = 0; i < user_collector_factories.size(); i++) {
+ int_tbl_prop_collector_factories.emplace_back(
+ new UserKeyTablePropertiesCollectorFactory(
+ user_collector_factories[i]));
+ }
+ int unknown_level = -1;
+ uint32_t cf_id;
+
+ if (r->cfh != nullptr) {
+ // user explicitly specified that this file will be ingested into cfh,
+ // we can persist this information in the file.
+ cf_id = r->cfh->GetID();
+ r->column_family_name = r->cfh->GetName();
+ } else {
+ r->column_family_name = "";
+ cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+ }
+
+ TableBuilderOptions table_builder_options(
+ r->ioptions, r->mutable_cf_options, r->internal_comparator,
+ &int_tbl_prop_collector_factories, compression_type,
+ sample_for_compression, compression_opts, r->skip_filters,
+ r->column_family_name, unknown_level);
+ r->file_writer.reset(
+ new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(sst_file)),
+ file_path, r->env_options, r->ioptions.env,
+ nullptr /* stats */, r->ioptions.listeners));
+
+ // TODO(tec) : If table_factory is using compressed block cache, we will
+ // be adding the external sst file blocks into it, which is wasteful.
+ r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
+ table_builder_options, cf_id, r->file_writer.get()));
+
+ r->file_info = ExternalSstFileInfo();
+ r->file_info.file_path = file_path;
+ r->file_info.version = 2;
+ return s;
+}
+
+Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
+ return rep_->Add(user_key, value, ValueType::kTypeValue);
+}
+
+Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
+ return rep_->Add(user_key, value, ValueType::kTypeValue);
+}
+
+Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) {
+ return rep_->Add(user_key, value, ValueType::kTypeMerge);
+}
+
+Status SstFileWriter::Delete(const Slice& user_key) {
+ return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion);
+}
+
+Status SstFileWriter::DeleteRange(const Slice& begin_key,
+ const Slice& end_key) {
+ return rep_->DeleteRange(begin_key, end_key);
+}
+
+Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
+ Rep* r = rep_.get();
+ if (!r->builder) {
+ return Status::InvalidArgument("File is not opened");
+ }
+ if (r->file_info.num_entries == 0 &&
+ r->file_info.num_range_del_entries == 0) {
+ return Status::InvalidArgument("Cannot create sst file with no entries");
+ }
+
+ Status s = r->builder->Finish();
+ r->file_info.file_size = r->builder->FileSize();
+
+ if (s.ok()) {
+ s = r->file_writer->Sync(r->ioptions.use_fsync);
+ r->InvalidatePageCache(true /* closing */);
+ if (s.ok()) {
+ s = r->file_writer->Close();
+ }
+ }
+ if (!s.ok()) {
+ r->ioptions.env->DeleteFile(r->file_info.file_path);
+ }
+
+ if (file_info != nullptr) {
+ *file_info = r->file_info;
+ }
+
+ r->builder.reset();
+ return s;
+}
+
+uint64_t SstFileWriter::FileSize() {
+ return rep_->file_info.file_size;
+}
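+
+// Typical usage (sketch; path and keys are placeholders): keys must be added
+// in strictly ascending order, and Finish() must be called to produce a
+// usable file (on failure the partially written file is deleted).
+//
+//   SstFileWriter writer(EnvOptions(), options);
+//   Status s = writer.Open("/tmp/example.sst");
+//   if (s.ok()) s = writer.Put("key1", "value1");
+//   if (s.ok()) s = writer.Put("key2", "value2");
+//   if (s.ok()) s = writer.Finish();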
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/sst_file_writer_collectors.h b/src/rocksdb/table/sst_file_writer_collectors.h
new file mode 100644
index 000000000..01ecec971
--- /dev/null
+++ b/src/rocksdb/table/sst_file_writer_collectors.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/types.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Table Properties that are specific to tables created by SstFileWriter.
+struct ExternalSstFilePropertyNames {
+ // value of this property is a fixed uint32 number.
+ static const std::string kVersion;
+ // value of this property is a fixed uint64 number.
+ static const std::string kGlobalSeqno;
+};
+
+// PropertiesCollector used to add properties specific to tables
+// generated by SstFileWriter
+class SstFileWriterPropertiesCollector : public IntTblPropCollector {
+ public:
+ explicit SstFileWriterPropertiesCollector(int32_t version,
+ SequenceNumber global_seqno)
+ : version_(version), global_seqno_(global_seqno) {}
+
+ virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+ // Intentionally left blank. Have no interest in collecting stats for
+ // individual key/value pairs.
+ return Status::OK();
+ }
+
+ virtual void BlockAdd(uint64_t /* blockRawBytes */,
+ uint64_t /* blockCompressedBytesFast */,
+ uint64_t /* blockCompressedBytesSlow */) override {
+ // Intentionally left blank. No interest in collecting stats for
+ // blocks.
+ return;
+ }
+
+ virtual Status Finish(UserCollectedProperties* properties) override {
+ // File version
+ std::string version_val;
+ PutFixed32(&version_val, static_cast<uint32_t>(version_));
+ properties->insert({ExternalSstFilePropertyNames::kVersion, version_val});
+
+ // Global Sequence number
+ std::string seqno_val;
+ PutFixed64(&seqno_val, static_cast<uint64_t>(global_seqno_));
+ properties->insert({ExternalSstFilePropertyNames::kGlobalSeqno, seqno_val});
+
+ return Status::OK();
+ }
+
+ virtual const char* Name() const override {
+ return "SstFileWriterPropertiesCollector";
+ }
+
+ virtual UserCollectedProperties GetReadableProperties() const override {
+ return {{ExternalSstFilePropertyNames::kVersion, ToString(version_)}};
+ }
+
+ private:
+ int32_t version_;
+ SequenceNumber global_seqno_;
+};
+
+class SstFileWriterPropertiesCollectorFactory
+ : public IntTblPropCollectorFactory {
+ public:
+ explicit SstFileWriterPropertiesCollectorFactory(int32_t version,
+ SequenceNumber global_seqno)
+ : version_(version), global_seqno_(global_seqno) {}
+
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t /*column_family_id*/) override {
+ return new SstFileWriterPropertiesCollector(version_, global_seqno_);
+ }
+
+ virtual const char* Name() const override {
+ return "SstFileWriterPropertiesCollector";
+ }
+
+ private:
+ int32_t version_;
+ SequenceNumber global_seqno_;
+};
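+
+// Example (sketch): reading back the global sequence number property written
+// by SstFileWriterPropertiesCollector::Finish(). GetFixed64 mirrors the
+// PutFixed64 encoding used above.
+//
+//   const auto& uprops = table_properties.user_collected_properties;
+//   auto it = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+//   if (it != uprops.end()) {
+//     Slice v(it->second);
+//     uint64_t global_seqno = 0;
+//     GetFixed64(&v, &global_seqno);
+//   }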
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h
new file mode 100644
index 000000000..541251073
--- /dev/null
+++ b/src/rocksdb/table/table_builder.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/table_properties_collector.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+
+struct TableReaderOptions {
+ // @param skip_filters Disables loading/accessing the filter block
+ TableReaderOptions(const ImmutableCFOptions& _ioptions,
+ const SliceTransform* _prefix_extractor,
+ const EnvOptions& _env_options,
+ const InternalKeyComparator& _internal_comparator,
+ bool _skip_filters = false, bool _immortal = false,
+ int _level = -1,
+ BlockCacheTracer* const _block_cache_tracer = nullptr)
+ : TableReaderOptions(_ioptions, _prefix_extractor, _env_options,
+ _internal_comparator, _skip_filters, _immortal,
+ _level, 0 /* _largest_seqno */,
+ _block_cache_tracer) {}
+
+ // @param skip_filters Disables loading/accessing the filter block
+ TableReaderOptions(const ImmutableCFOptions& _ioptions,
+ const SliceTransform* _prefix_extractor,
+ const EnvOptions& _env_options,
+ const InternalKeyComparator& _internal_comparator,
+ bool _skip_filters, bool _immortal, int _level,
+ SequenceNumber _largest_seqno,
+ BlockCacheTracer* const _block_cache_tracer)
+ : ioptions(_ioptions),
+ prefix_extractor(_prefix_extractor),
+ env_options(_env_options),
+ internal_comparator(_internal_comparator),
+ skip_filters(_skip_filters),
+ immortal(_immortal),
+ level(_level),
+ largest_seqno(_largest_seqno),
+ block_cache_tracer(_block_cache_tracer) {}
+
+ const ImmutableCFOptions& ioptions;
+ const SliceTransform* prefix_extractor;
+ const EnvOptions& env_options;
+ const InternalKeyComparator& internal_comparator;
+ // This is only used for BlockBasedTable (reader)
+ bool skip_filters;
+ // Whether the table will be valid as long as the DB is open
+ bool immortal;
+ // what level this table/file is on, -1 for "not set, don't know"
+ int level;
+ // largest seqno in the table
+ SequenceNumber largest_seqno;
+ BlockCacheTracer* const block_cache_tracer;
+};
+
+struct TableBuilderOptions {
+ TableBuilderOptions(
+ const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions,
+ const InternalKeyComparator& _internal_comparator,
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ _int_tbl_prop_collector_factories,
+ CompressionType _compression_type, uint64_t _sample_for_compression,
+ const CompressionOptions& _compression_opts, bool _skip_filters,
+ const std::string& _column_family_name, int _level,
+ const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0,
+ const uint64_t _target_file_size = 0,
+ const uint64_t _file_creation_time = 0)
+ : ioptions(_ioptions),
+ moptions(_moptions),
+ internal_comparator(_internal_comparator),
+ int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories),
+ compression_type(_compression_type),
+ sample_for_compression(_sample_for_compression),
+ compression_opts(_compression_opts),
+ skip_filters(_skip_filters),
+ column_family_name(_column_family_name),
+ level(_level),
+ creation_time(_creation_time),
+ oldest_key_time(_oldest_key_time),
+ target_file_size(_target_file_size),
+ file_creation_time(_file_creation_time) {}
+ const ImmutableCFOptions& ioptions;
+ const MutableCFOptions& moptions;
+ const InternalKeyComparator& internal_comparator;
+ const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
+ int_tbl_prop_collector_factories;
+ CompressionType compression_type;
+ uint64_t sample_for_compression;
+ const CompressionOptions& compression_opts;
+ bool skip_filters; // only used by BlockBasedTableBuilder
+ const std::string& column_family_name;
+ int level; // what level this table/file is on, -1 for "not set, don't know"
+ const uint64_t creation_time;
+ const int64_t oldest_key_time;
+ const uint64_t target_file_size;
+ const uint64_t file_creation_time;
+};
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ virtual ~TableBuilder() {}
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ virtual void Add(const Slice& key, const Slice& value) = 0;
+
+ // Return non-ok iff some error has been detected.
+ virtual Status status() const = 0;
+
+ // Finish building the table.
+ // REQUIRES: Finish(), Abandon() have not been called
+ virtual Status Finish() = 0;
+
+ // Indicate that the contents of this builder should be abandoned.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ virtual void Abandon() = 0;
+
+ // Number of calls to Add() so far.
+ virtual uint64_t NumEntries() const = 0;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ virtual uint64_t FileSize() const = 0;
+
+ // Whether the user-defined table properties collectors suggest that the
+ // file should be further compacted.
+ virtual bool NeedCompact() const { return false; }
+
+ // Returns table properties
+ virtual TableProperties GetTableProperties() const = 0;
+
+ // Return file checksum
+ virtual const std::string& GetFileChecksum() const = 0;
+
+ // Return file checksum function name
+ virtual const char* GetFileChecksumFuncName() const = 0;
+};
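+
+// Example (illustrative sketch): driving a TableBuilder. Keys must be added
+// in comparator order, and exactly one of Finish() or Abandon() must be
+// called before destruction.
+//
+//   builder->Add(ikey1, value1);  // internal keys, in sorted order
+//   builder->Add(ikey2, value2);
+//   Status s = builder->status();
+//   if (s.ok()) {
+//     s = builder->Finish();
+//   } else {
+//     builder->Abandon();
+//   }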
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc
new file mode 100644
index 000000000..d1dacd1a5
--- /dev/null
+++ b/src/rocksdb/table/table_properties.cc
@@ -0,0 +1,272 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/table_properties.h"
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/block_based/block.h"
+#include "table/internal_iterator.h"
+#include "table/table_properties_internal.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily =
+ port::kMaxInt32;
+
+namespace {
+ void AppendProperty(
+ std::string& props,
+ const std::string& key,
+ const std::string& value,
+ const std::string& prop_delim,
+ const std::string& kv_delim) {
+ props.append(key);
+ props.append(kv_delim);
+ props.append(value);
+ props.append(prop_delim);
+ }
+
+ template <class TValue>
+ void AppendProperty(
+ std::string& props,
+ const std::string& key,
+ const TValue& value,
+ const std::string& prop_delim,
+ const std::string& kv_delim) {
+ AppendProperty(
+ props, key, ToString(value), prop_delim, kv_delim
+ );
+ }
+
+ // Seek to the specified meta block.
+ // Sets `*is_found` to true iff it successfully seeks to that block.
+ Status SeekToMetaBlock(InternalIterator* meta_iter,
+ const std::string& block_name, bool* is_found,
+ BlockHandle* block_handle = nullptr) {
+ if (block_handle != nullptr) {
+ *block_handle = BlockHandle::NullBlockHandle();
+ }
+ *is_found = true;
+ meta_iter->Seek(block_name);
+ if (meta_iter->status().ok()) {
+ if (meta_iter->Valid() && meta_iter->key() == block_name) {
+ *is_found = true;
+ if (block_handle) {
+ Slice v = meta_iter->value();
+ return block_handle->DecodeFrom(&v);
+ }
+ } else {
+ *is_found = false;
+ return Status::OK();
+ }
+ }
+ return meta_iter->status();
+ }
+} // namespace
+
+std::string TableProperties::ToString(
+ const std::string& prop_delim,
+ const std::string& kv_delim) const {
+ std::string result;
+ result.reserve(1024);
+
+ // Basic Info
+ AppendProperty(result, "# data blocks", num_data_blocks, prop_delim,
+ kv_delim);
+ AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
+ AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim);
+ AppendProperty(result, "# merge operands", num_merge_operands, prop_delim,
+ kv_delim);
+ AppendProperty(result, "# range deletions", num_range_deletions, prop_delim,
+ kv_delim);
+
+ AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
+ AppendProperty(result, "raw average key size",
+ num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
+ prop_delim, kv_delim);
+ AppendProperty(result, "raw value size", raw_value_size, prop_delim,
+ kv_delim);
+ AppendProperty(result, "raw average value size",
+ num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+ char index_block_size_str[80];
+ snprintf(index_block_size_str, sizeof(index_block_size_str),
+ "index block size (user-key? %d, delta-value? %d)",
+ static_cast<int>(index_key_is_user_key),
+ static_cast<int>(index_value_is_delta_encoded));
+ AppendProperty(result, index_block_size_str, index_size, prop_delim,
+ kv_delim);
+ if (index_partitions != 0) {
+ AppendProperty(result, "# index partitions", index_partitions, prop_delim,
+ kv_delim);
+ AppendProperty(result, "top-level index size", top_level_index_size, prop_delim,
+ kv_delim);
+ }
+ AppendProperty(result, "filter block size", filter_size, prop_delim,
+ kv_delim);
+ AppendProperty(result, "(estimated) table size",
+ data_size + index_size + filter_size, prop_delim, kv_delim);
+
+ AppendProperty(
+ result, "filter policy name",
+ filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "prefix extractor name",
+ prefix_extractor_name.empty() ? std::string("N/A")
+ : prefix_extractor_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "column family ID",
+ column_family_id ==
+ ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::
+ Context::kUnknownColumnFamily
+ ? std::string("N/A")
+ : ROCKSDB_NAMESPACE::ToString(column_family_id),
+ prop_delim, kv_delim);
+ AppendProperty(
+ result, "column family name",
+ column_family_name.empty() ? std::string("N/A") : column_family_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "comparator name",
+ comparator_name.empty() ? std::string("N/A") : comparator_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(
+ result, "merge operator name",
+ merge_operator_name.empty() ? std::string("N/A") : merge_operator_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "property collectors names",
+ property_collectors_names.empty() ? std::string("N/A")
+ : property_collectors_names,
+ prop_delim, kv_delim);
+
+ AppendProperty(
+ result, "SST file compression algo",
+ compression_name.empty() ? std::string("N/A") : compression_name,
+ prop_delim, kv_delim);
+
+ AppendProperty(
+ result, "SST file compression options",
+ compression_options.empty() ? std::string("N/A") : compression_options,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim);
+
+ AppendProperty(result, "time stamp of earliest key", oldest_key_time,
+ prop_delim, kv_delim);
+
+ AppendProperty(result, "file creation time", file_creation_time, prop_delim,
+ kv_delim);
+
+ return result;
+}
+
+void TableProperties::Add(const TableProperties& tp) {
+ data_size += tp.data_size;
+ index_size += tp.index_size;
+ index_partitions += tp.index_partitions;
+ top_level_index_size += tp.top_level_index_size;
+ index_key_is_user_key += tp.index_key_is_user_key;
+ index_value_is_delta_encoded += tp.index_value_is_delta_encoded;
+ filter_size += tp.filter_size;
+ raw_key_size += tp.raw_key_size;
+ raw_value_size += tp.raw_value_size;
+ num_data_blocks += tp.num_data_blocks;
+ num_entries += tp.num_entries;
+ num_deletions += tp.num_deletions;
+ num_merge_operands += tp.num_merge_operands;
+ num_range_deletions += tp.num_range_deletions;
+}
+
+const std::string TablePropertiesNames::kDataSize =
+ "rocksdb.data.size";
+const std::string TablePropertiesNames::kIndexSize =
+ "rocksdb.index.size";
+const std::string TablePropertiesNames::kIndexPartitions =
+ "rocksdb.index.partitions";
+const std::string TablePropertiesNames::kTopLevelIndexSize =
+ "rocksdb.top-level.index.size";
+const std::string TablePropertiesNames::kIndexKeyIsUserKey =
+ "rocksdb.index.key.is.user.key";
+const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded =
+ "rocksdb.index.value.is.delta.encoded";
+const std::string TablePropertiesNames::kFilterSize =
+ "rocksdb.filter.size";
+const std::string TablePropertiesNames::kRawKeySize =
+ "rocksdb.raw.key.size";
+const std::string TablePropertiesNames::kRawValueSize =
+ "rocksdb.raw.value.size";
+const std::string TablePropertiesNames::kNumDataBlocks =
+ "rocksdb.num.data.blocks";
+const std::string TablePropertiesNames::kNumEntries =
+ "rocksdb.num.entries";
+const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys";
+const std::string TablePropertiesNames::kMergeOperands =
+ "rocksdb.merge.operands";
+const std::string TablePropertiesNames::kNumRangeDeletions =
+ "rocksdb.num.range-deletions";
+const std::string TablePropertiesNames::kFilterPolicy =
+ "rocksdb.filter.policy";
+const std::string TablePropertiesNames::kFormatVersion =
+ "rocksdb.format.version";
+const std::string TablePropertiesNames::kFixedKeyLen =
+ "rocksdb.fixed.key.length";
+const std::string TablePropertiesNames::kColumnFamilyId =
+ "rocksdb.column.family.id";
+const std::string TablePropertiesNames::kColumnFamilyName =
+ "rocksdb.column.family.name";
+const std::string TablePropertiesNames::kComparator = "rocksdb.comparator";
+const std::string TablePropertiesNames::kMergeOperator =
+ "rocksdb.merge.operator";
+const std::string TablePropertiesNames::kPrefixExtractorName =
+ "rocksdb.prefix.extractor.name";
+const std::string TablePropertiesNames::kPropertyCollectors =
+ "rocksdb.property.collectors";
+const std::string TablePropertiesNames::kCompression = "rocksdb.compression";
+const std::string TablePropertiesNames::kCompressionOptions =
+ "rocksdb.compression_options";
+const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time";
+const std::string TablePropertiesNames::kOldestKeyTime =
+ "rocksdb.oldest.key.time";
+const std::string TablePropertiesNames::kFileCreationTime =
+ "rocksdb.file.creation.time";
+
+extern const std::string kPropertiesBlock = "rocksdb.properties";
+// Old property block name for backward compatibility
+extern const std::string kPropertiesBlockOldName = "rocksdb.stats";
+extern const std::string kCompressionDictBlock = "rocksdb.compression_dict";
+extern const std::string kRangeDelBlock = "rocksdb.range_del";
+
+// Seek to the properties block.
+// Sets `*is_found` to true iff it successfully seeks to the properties block.
+Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) {
+ Status status = SeekToMetaBlock(meta_iter, kPropertiesBlock, is_found);
+ if (!*is_found && status.ok()) {
+ status = SeekToMetaBlock(meta_iter, kPropertiesBlockOldName, is_found);
+ }
+ return status;
+}
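+
+// Example (sketch): given a metaindex iterator `meta_iter` obtained from a
+// table reader, locate the properties block.
+//
+//   bool found = false;
+//   Status s = SeekToPropertiesBlock(meta_iter, &found);
+//   if (s.ok() && found) {
+//     // meta_iter->value() holds the encoded BlockHandle of the
+//     // properties block.
+//   }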
+
+// Seek to the compression dictionary block.
+// Sets `*is_found` to true iff it successfully seeks to that block.
+Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found,
+ BlockHandle* block_handle) {
+ return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found, block_handle);
+}
+
+Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found,
+ BlockHandle* block_handle = nullptr) {
+ return SeekToMetaBlock(meta_iter, kRangeDelBlock, is_found, block_handle);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_properties_internal.h b/src/rocksdb/table/table_properties_internal.h
new file mode 100644
index 000000000..a7a92e3e1
--- /dev/null
+++ b/src/rocksdb/table/table_properties_internal.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockHandle;
+
+// Seek to the properties block.
+// If it successfully seeks to the properties block, "is_found" will be
+// set to true.
+Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found);
+
+// Seek to the compression dictionary block.
+// If it successfully seeks to the compression dictionary block, "is_found"
+// will be set to true.
+Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found,
+ BlockHandle* block_handle);
+
+// TODO(andrewkr) should not put all meta block in table_properties.h/cc
+Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found,
+ BlockHandle* block_handle);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_reader.h b/src/rocksdb/table/table_reader.h
new file mode 100644
index 000000000..4a08e3883
--- /dev/null
+++ b/src/rocksdb/table/table_reader.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/slice_transform.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/multiget_context.h"
+#include "table/table_reader_caller.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+struct ParsedInternalKey;
+class Slice;
+class Arena;
+struct ReadOptions;
+struct TableProperties;
+class GetContext;
+class MultiGetContext;
+
+// A Table (also referred to as SST) is a sorted map from strings to strings.
+// Tables are immutable and persistent. A Table may be safely accessed from
+// multiple threads without external synchronization. Table readers are used
+// for reading various types of table formats supported by rocksdb including
+// BlockBasedTable, PlainTable and CuckooTable format.
+class TableReader {
+ public:
+ virtual ~TableReader() {}
+
+ // Returns a new iterator over the table contents.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // When destroying the iterator, the caller will not call "delete"
+ // but Iterator::~Iterator() directly. The destructor needs to destroy
+ // all the states but those allocated in arena.
+ // skip_filters: disables checking the bloom filters even if they exist. This
+ // option is effective only for block-based table format.
+ // compaction_readahead_size: its value will only be used if caller =
+ // kCompaction
+ virtual InternalIterator* NewIterator(
+ const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena,
+ bool skip_filters, TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) = 0;
+
+ virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& /*read_options*/) {
+ return nullptr;
+ }
+
+ // Given a key, return an approximate byte offset in the file where
+ // the data for that key begins (or would begin if the key were
+ // present in the file). The returned value is in terms of file
+ // bytes, and so includes effects like compression of the underlying data.
+ // E.g., the approximate offset of the last key in the table will
+ // be close to the file length.
+ virtual uint64_t ApproximateOffsetOf(const Slice& key,
+ TableReaderCaller caller) = 0;
+
+ // Given start and end keys, return the approximate data size in the file
+ // between the keys. The returned value is in terms of file bytes, and so
+ // includes effects like compression of the underlying data.
+ virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) = 0;
+
+ // Set up the table for Compaction. Might change some parameters with
+ // posix_fadvise
+ virtual void SetupForCompaction() = 0;
+
+ virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
+
+ // Prepare work that can be done before the real Get()
+ virtual void Prepare(const Slice& /*target*/) {}
+
+ // Report an approximation of how much memory has been used.
+ virtual size_t ApproximateMemoryUsage() const = 0;
+
+ // Calls get_context->SaveValue() repeatedly, starting with
+ // the entry found after a call to Seek(key), until it returns false.
+ // May not make such a call if filter policy says that key is not present.
+ //
+ // get_context->MarkKeyMayExist needs to be called when it is configured to be
+ // memory only and the key is not found in the block cache.
+ //
+ // readOptions is the options for the read
+ // key is the key to search for
+ // skip_filters: disables checking the bloom filters even if they exist. This
+ // option is effective only for block-based table format.
+ virtual Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context,
+ const SliceTransform* prefix_extractor,
+ bool skip_filters = false) = 0;
+
+ virtual void MultiGet(const ReadOptions& readOptions,
+ const MultiGetContext::Range* mget_range,
+ const SliceTransform* prefix_extractor,
+ bool skip_filters = false) {
+ for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) {
+ *iter->s = Get(readOptions, iter->ikey, iter->get_context,
+ prefix_extractor, skip_filters);
+ }
+ }
+
+ // Prefetch data corresponding to a given range of keys.
+ // Typically this functionality is required for table implementations that
+ // persist the data on a non-volatile storage medium like disk/SSD.
+ virtual Status Prefetch(const Slice* begin = nullptr,
+ const Slice* end = nullptr) {
+ (void) begin;
+ (void) end;
+ // Default implementation is NOOP.
+ // The child class should implement functionality when applicable
+ return Status::OK();
+ }
+
+ // Convert the DB file to a human-readable form.
+ virtual Status DumpTable(WritableFile* /*out_file*/) {
+ return Status::NotSupported("DumpTable() not supported");
+ }
+
+ // Check whether there is corruption in this DB file.
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
+ TableReaderCaller /*caller*/) {
+ return Status::NotSupported("VerifyChecksum() not supported");
+ }
+};
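+
+// Example (sketch): estimating the on-disk size between two internal keys;
+// the result reflects file bytes, including compression effects, per
+// ApproximateSize() above.
+//
+//   uint64_t bytes = reader->ApproximateSize(
+//       start_ikey, end_ikey, TableReaderCaller::kUncategorized);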
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc
new file mode 100644
index 000000000..f1fd605aa
--- /dev/null
+++ b/src/rocksdb/table/table_reader_bench.cc
@@ -0,0 +1,347 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// Make a key where `i` determines the first four characters and `j`
+// determines the last four characters.
+static std::string MakeKey(int i, int j, bool through_db) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+ if (through_db) {
+ return std::string(buf);
+ }
+ // If we directly query table, which operates on internal keys
+ // instead of user keys, we need to add 8 bytes of internal
+ // information (row type etc) to user key to make an internal
+ // key.
+ InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+ return key.Encode().ToString();
+}
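+
+// For example, MakeKey(1, 2, /*through_db=*/true) yields "0001__key___0002";
+// with through_db=false the same user key is wrapped into an internal key
+// carrying sequence number 0 and type kTypeValue (8 extra trailing bytes).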
+
+uint64_t Now(Env* env, bool measured_by_nanosecond) {
+ return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
+}
+} // namespace
+
+// A very simple benchmark:
+// Create a table with roughly num_keys1 * num_keys2 keys, where there are
+// num_keys1 key prefixes, each with num_keys2 distinct keys differing in
+// the suffix part.
+// If if_query_empty_keys = false, query the existing keys num_keys1 *
+// num_keys2 times randomly.
+// If if_query_empty_keys = true, query num_keys1 * num_keys2 random
+// non-existent keys.
+// Print out the total time.
+// If through_db = true, a full DB will be created and queries will run
+// against it. Otherwise, operations go directly through the table level.
+//
+// If for_iterator = true, instead of querying one key each time, it queries
+// a range sharing the same prefix.
+namespace {
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
+ ReadOptions& read_options, int num_keys1,
+ int num_keys2, int num_iter, int /*prefix_len*/,
+ bool if_query_empty_keys, bool for_iterator,
+ bool through_db, bool measured_by_nanosecond) {
+ ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+
+ std::string file_name =
+ test::PerThreadDBPath("rocksdb_table_reader_benchmark");
+ std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db");
+ WriteOptions wo;
+ Env* env = Env::Default();
+ TableBuilder* tb = nullptr;
+ DB* db = nullptr;
+ Status s;
+ const ImmutableCFOptions ioptions(opts);
+ const ColumnFamilyOptions cfo(opts);
+ const MutableCFOptions moptions(cfo);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ if (!through_db) {
+ std::unique_ptr<WritableFile> file;
+ env->NewWritableFile(file_name, &file, env_options);
+
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+ int_tbl_prop_collector_factories;
+
+ file_writer.reset(new WritableFileWriter(
+ NewLegacyWritableFileWrapper(std::move(file)), file_name, env_options));
+ int unknown_level = -1;
+ tb = opts.table_factory->NewTableBuilder(
+ TableBuilderOptions(
+ ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+ CompressionType::kNoCompression, 0 /* sample_for_compression */,
+ CompressionOptions(), false /* skip_filters */,
+ kDefaultColumnFamilyName, unknown_level),
+ 0 /* column_family_id */, file_writer.get());
+ } else {
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+ }
+ // Populate the table with num_keys1 * num_keys2 keys.
+ for (int i = 0; i < num_keys1; i++) {
+ for (int j = 0; j < num_keys2; j++) {
+ std::string key = MakeKey(i * 2, j, through_db);
+ if (!through_db) {
+ tb->Add(key, key);
+ } else {
+ db->Put(wo, key, key);
+ }
+ }
+ }
+ if (!through_db) {
+ tb->Finish();
+ file_writer->Close();
+ } else {
+ db->Flush(FlushOptions());
+ }
+
+ std::unique_ptr<TableReader> table_reader;
+ if (!through_db) {
+ std::unique_ptr<RandomAccessFile> raf;
+ s = env->NewRandomAccessFile(file_name, &raf, env_options);
+ if (!s.ok()) {
+ fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ uint64_t file_size;
+ env->GetFileSize(file_name, &file_size);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(raf),
+ file_name));
+ s = opts.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, moptions.prefix_extractor.get(),
+ env_options, ikc),
+ std::move(file_reader), file_size, &table_reader);
+ if (!s.ok()) {
+ fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ }
+
+ Random rnd(301);
+ std::string result;
+ HistogramImpl hist;
+
+ for (int it = 0; it < num_iter; it++) {
+ for (int i = 0; i < num_keys1; i++) {
+ for (int j = 0; j < num_keys2; j++) {
+ int r1 = rnd.Uniform(num_keys1) * 2;
+ int r2 = rnd.Uniform(num_keys2);
+ if (if_query_empty_keys) {
+ r1++;
+ r2 = num_keys2 * 2 - r2;
+ }
+
+ if (!for_iterator) {
+ // Query one existing key;
+ std::string key = MakeKey(r1, r2, through_db);
+ uint64_t start_time = Now(env, measured_by_nanosecond);
+ if (!through_db) {
+ PinnableSlice value;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ GetContext get_context(ioptions.user_comparator,
+ ioptions.merge_operator, ioptions.info_log,
+ ioptions.statistics, GetContext::kNotFound,
+ Slice(key), &value, nullptr, &merge_context,
+ true, &max_covering_tombstone_seq, env);
+ s = table_reader->Get(read_options, key, &get_context, nullptr);
+ } else {
+ s = db->Get(read_options, key, &result);
+ }
+ hist.Add(Now(env, measured_by_nanosecond) - start_time);
+ } else {
+ int r2_len;
+ if (if_query_empty_keys) {
+ r2_len = 0;
+ } else {
+ r2_len = rnd.Uniform(num_keys2) + 1;
+ if (r2_len + r2 > num_keys2) {
+ r2_len = num_keys2 - r2;
+ }
+ }
+ std::string start_key = MakeKey(r1, r2, through_db);
+ std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
+ uint64_t total_time = 0;
+ uint64_t start_time = Now(env, measured_by_nanosecond);
+ Iterator* iter = nullptr;
+ InternalIterator* iiter = nullptr;
+ if (!through_db) {
+ iiter = table_reader->NewIterator(
+ read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+ } else {
+ iter = db->NewIterator(read_options);
+ }
+ int count = 0;
+ for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key);
+ through_db ? iter->Valid() : iiter->Valid();
+ through_db ? iter->Next() : iiter->Next()) {
+ if (if_query_empty_keys) {
+ break;
+ }
+ // verify key;
+ total_time += Now(env, measured_by_nanosecond) - start_time;
+ assert(Slice(MakeKey(r1, r2 + count, through_db)) ==
+ (through_db ? iter->key() : iiter->key()));
+ start_time = Now(env, measured_by_nanosecond);
+ if (++count >= r2_len) {
+ break;
+ }
+ }
+ if (count != r2_len) {
+          fprintf(
+              stderr, "Iterator did not iterate the expected number of "
+              "entries. Expected %d but got %d\n", r2_len, count);
+ assert(false);
+ }
+ delete iter;
+ total_time += Now(env, measured_by_nanosecond) - start_time;
+ hist.Add(total_time);
+ }
+ }
+ }
+ }
+
+ fprintf(
+ stderr,
+ "==================================================="
+ "====================================================\n"
+ "InMemoryTableSimpleBenchmark: %20s num_key1: %5d "
+ "num_key2: %5d %10s\n"
+ "==================================================="
+ "===================================================="
+ "\nHistogram (unit: %s): \n%s",
+ opts.table_factory->Name(), num_keys1, num_keys2,
+ for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"),
+ measured_by_nanosecond ? "nanosecond" : "microsecond",
+ hist.ToString().c_str());
+ if (!through_db) {
+ env->DeleteFile(file_name);
+ } else {
+ delete db;
+ db = nullptr;
+ DestroyDB(dbname, opts);
+ }
+}
+} // namespace
+} // namespace ROCKSDB_NAMESPACE
+
+DEFINE_bool(query_empty, false, "query non-existing keys instead of existing "
+ "ones.");
+DEFINE_int32(num_keys1, 4096, "number of distinct key prefixes");
+DEFINE_int32(num_keys2, 512, "number of distinct keys for each prefix");
+DEFINE_int32(iter, 3, "number of iterations to repeat the whole benchmark");
+DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes");
+DEFINE_bool(iterator, false, "Benchmark iterators instead of point lookups");
+DEFINE_bool(through_db, false, "If enabled, a DB instance will be created and "
+    "queries will go through the DB. Otherwise, they will go directly "
+    "against a table reader.");
+DEFINE_bool(mmap_read, true, "Whether to use mmap reads");
+DEFINE_string(table_factory, "block_based",
+ "Table factory to use: `block_based` (default), `plain_table` or "
+ "`cuckoo_hash`.");
+DEFINE_string(time_unit, "microsecond",
+              "The time unit used for measuring performance. Users can specify "
+              "`microsecond` (default) or `nanosecond`.");
+
+int main(int argc, char** argv) {
+ SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+ " [OPTIONS]...");
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ std::shared_ptr<ROCKSDB_NAMESPACE::TableFactory> tf;
+ ROCKSDB_NAMESPACE::Options options;
+ if (FLAGS_prefix_len < 16) {
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
+ }
+ ROCKSDB_NAMESPACE::ReadOptions ro;
+ ROCKSDB_NAMESPACE::EnvOptions env_options;
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+
+ if (FLAGS_table_factory == "cuckoo_hash") {
+#ifndef ROCKSDB_LITE
+ options.allow_mmap_reads = FLAGS_mmap_read;
+ env_options.use_mmap_reads = FLAGS_mmap_read;
+ ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
+ table_options.hash_table_ratio = 0.75;
+ tf.reset(ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options));
+#else
+ fprintf(stderr, "Plain table is not supported in lite mode\n");
+ exit(1);
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_table_factory == "plain_table") {
+#ifndef ROCKSDB_LITE
+ options.allow_mmap_reads = FLAGS_mmap_read;
+ env_options.use_mmap_reads = FLAGS_mmap_read;
+
+ ROCKSDB_NAMESPACE::PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8;
+ plain_table_options.hash_table_ratio = 0.75;
+
+ tf.reset(new ROCKSDB_NAMESPACE::PlainTableFactory(plain_table_options));
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
+#else
+ fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+ exit(1);
+#endif // ROCKSDB_LITE
+ } else if (FLAGS_table_factory == "block_based") {
+ tf.reset(new ROCKSDB_NAMESPACE::BlockBasedTableFactory());
+ } else {
+ fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str());
+ }
+
+ if (tf) {
+    // If the user provides an invalid time unit, fall back to microseconds.
+ bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
+
+ options.table_factory = tf;
+ ROCKSDB_NAMESPACE::TableReaderBenchmark(
+ options, env_options, ro, FLAGS_num_keys1, FLAGS_num_keys2, FLAGS_iter,
+ FLAGS_prefix_len, FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db,
+ measured_by_nanosecond);
+ } else {
+ return 1;
+ }
+
+ return 0;
+}
+
+#endif // GFLAGS
diff --git a/src/rocksdb/table/table_reader_caller.h b/src/rocksdb/table/table_reader_caller.h
new file mode 100644
index 000000000..7a57b5e98
--- /dev/null
+++ b/src/rocksdb/table/table_reader_caller.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+namespace ROCKSDB_NAMESPACE {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or if the table reader is called in a test environment, e.g., a unit test,
+// the table reader benchmark, etc.
+enum TableReaderCaller : char {
+ kUserGet = 1,
+ kUserMultiGet = 2,
+ kUserIterator = 3,
+ kUserApproximateSize = 4,
+ kUserVerifyChecksum = 5,
+ kSSTDumpTool = 6,
+ kExternalSSTIngestion = 7,
+ kRepair = 8,
+ kPrefetch = 9,
+ kCompaction = 10,
+ // A compaction job may refill the block cache with blocks in the new SST
+ // files if paranoid_file_checks is true.
+ kCompactionRefill = 11,
+ // After building a table, it may load all its blocks into the block cache if
+ // paranoid_file_checks is true.
+ kFlush = 12,
+ // sst_file_reader.
+ kSSTFileReader = 13,
+ // A list of callers that are either not interesting for analysis or are
+ // calling from a test environment, e.g., unit test, benchmark, etc.
+ kUncategorized = 14,
+ // All callers should be added before kMaxBlockCacheLookupCaller.
+ kMaxBlockCacheLookupCaller
+};
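+
+// A sketch of how a caller is typically threaded through (mirroring the
+// NewIterator calls in the benchmark above; "reader" is an assumed
+// TableReader pointer):
+//
+//   InternalIterator* iter = reader->NewIterator(
+//       read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+//       /*skip_filters=*/false, TableReaderCaller::kUserIterator);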
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc
new file mode 100644
index 000000000..2a24c99eb
--- /dev/null
+++ b/src/rocksdb/table/table_test.cc
@@ -0,0 +1,4651 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "block_fetcher.h"
+#include "cache/lru_cache.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "memtable/stl_wrappers.h"
+#include "meta_blocks.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/flush_block_policy.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+
+namespace {
+
+const std::string kDummyValue(10000, 'o');
+
+// DummyPropertiesCollector used to test BlockBasedTableProperties
+class DummyPropertiesCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return ""; }
+
+ Status Finish(UserCollectedProperties* /*properties*/) override {
+ return Status::OK();
+ }
+
+ Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override {
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+};
+
+class DummyPropertiesCollectorFactory1
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new DummyPropertiesCollector();
+ }
+ const char* Name() const override { return "DummyPropertiesCollector1"; }
+};
+
+class DummyPropertiesCollectorFactory2
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new DummyPropertiesCollector();
+ }
+ const char* Name() const override { return "DummyPropertiesCollector2"; }
+};
+
+// Return reverse of "key".
+// Used to test non-lexicographic comparators.
+std::string Reverse(const Slice& key) {
+ auto rev = key.ToString();
+ std::reverse(rev.begin(), rev.end());
+ return rev;
+}
+
+class ReverseKeyComparator : public Comparator {
+ public:
+ const char* Name() const override {
+ return "rocksdb.ReverseBytewiseComparator";
+ }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
+ }
+
+ void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override {
+ std::string s = Reverse(*start);
+ std::string l = Reverse(limit);
+ BytewiseComparator()->FindShortestSeparator(&s, l);
+ *start = Reverse(s);
+ }
+
+ void FindShortSuccessor(std::string* key) const override {
+ std::string s = Reverse(*key);
+ BytewiseComparator()->FindShortSuccessor(&s);
+ *key = Reverse(s);
+ }
+};
+
+ReverseKeyComparator reverse_key_comparator;
+
+void Increment(const Comparator* cmp, std::string* key) {
+ if (cmp == BytewiseComparator()) {
+ key->push_back('\0');
+ } else {
+ assert(cmp == &reverse_key_comparator);
+ std::string rev = Reverse(*key);
+ rev.push_back('\0');
+ *key = Reverse(rev);
+ }
+}
+
+} // namespace
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table.
+class Constructor {
+ public:
+ explicit Constructor(const Comparator* cmp)
+ : data_(stl_wrappers::LessOfComparator(cmp)) {}
+ virtual ~Constructor() { }
+
+ void Add(const std::string& key, const Slice& value) {
+ data_[key] = value.ToString();
+ }
+
+ // Finish constructing the data structure with all the keys that have
+ // been added so far. Returns the keys in sorted order in "*keys"
+ // and stores the key/value pairs in "*kvmap"
+ void Finish(const Options& options, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator,
+ std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) {
+ last_internal_key_ = &internal_comparator;
+ *kvmap = data_;
+ keys->clear();
+ for (const auto& kv : data_) {
+ keys->push_back(kv.first);
+ }
+ data_.clear();
+ Status s = FinishImpl(options, ioptions, moptions, table_options,
+ internal_comparator, *kvmap);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ }
+
+ // Construct the data structure from the data in "data"
+ virtual Status FinishImpl(const Options& options,
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator,
+ const stl_wrappers::KVMap& data) = 0;
+
+ virtual InternalIterator* NewIterator(
+ const SliceTransform* prefix_extractor = nullptr) const = 0;
+
+ virtual const stl_wrappers::KVMap& data() { return data_; }
+
+ virtual bool IsArenaMode() const { return false; }
+
+ virtual DB* db() const { return nullptr; } // Overridden in DBConstructor
+
+ virtual bool AnywayDeleteIterator() const { return false; }
+
+ protected:
+ const InternalKeyComparator* last_internal_key_;
+
+ private:
+ stl_wrappers::KVMap data_;
+};
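+
+// Typical usage of a Constructor subclass in the tests below (a sketch; the
+// options objects and the internal key comparator `ikc` are assumed to be
+// set up as in the individual tests):
+//
+//   TableConstructor c(BytewiseComparator());
+//   c.Add("k1", "v1");
+//   std::vector<std::string> keys;
+//   stl_wrappers::KVMap kvmap;
+//   c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);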
+
+class BlockConstructor: public Constructor {
+ public:
+ explicit BlockConstructor(const Comparator* cmp)
+ : Constructor(cmp),
+ comparator_(cmp),
+ block_(nullptr) { }
+ ~BlockConstructor() override { delete block_; }
+ Status FinishImpl(const Options& /*options*/,
+ const ImmutableCFOptions& /*ioptions*/,
+ const MutableCFOptions& /*moptions*/,
+ const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& /*internal_comparator*/,
+ const stl_wrappers::KVMap& kv_map) override {
+ delete block_;
+ block_ = nullptr;
+ BlockBuilder builder(table_options.block_restart_interval);
+
+ for (const auto kv : kv_map) {
+ builder.Add(kv.first, kv.second);
+ }
+ // Open the block
+ data_ = builder.Finish().ToString();
+ BlockContents contents;
+ contents.data = data_;
+ block_ = new Block(std::move(contents), kDisableGlobalSequenceNumber);
+ return Status::OK();
+ }
+ InternalIterator* NewIterator(
+ const SliceTransform* /*prefix_extractor*/) const override {
+ return block_->NewDataIterator(comparator_, comparator_);
+ }
+
+ private:
+ const Comparator* comparator_;
+ std::string data_;
+ Block* block_;
+
+ BlockConstructor();
+};
+
+// A helper class that converts internal format keys into user keys
+class KeyConvertingIterator : public InternalIterator {
+ public:
+ explicit KeyConvertingIterator(InternalIterator* iter,
+ bool arena_mode = false)
+ : iter_(iter), arena_mode_(arena_mode) {}
+ ~KeyConvertingIterator() override {
+ if (arena_mode_) {
+ iter_->~InternalIterator();
+ } else {
+ delete iter_;
+ }
+ }
+ bool Valid() const override { return iter_->Valid() && status_.ok(); }
+ void Seek(const Slice& target) override {
+ ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+ std::string encoded;
+ AppendInternalKey(&encoded, ikey);
+ iter_->Seek(encoded);
+ }
+ void SeekForPrev(const Slice& target) override {
+ ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+ std::string encoded;
+ AppendInternalKey(&encoded, ikey);
+ iter_->SeekForPrev(encoded);
+ }
+ void SeekToFirst() override { iter_->SeekToFirst(); }
+ void SeekToLast() override { iter_->SeekToLast(); }
+ void Next() override { iter_->Next(); }
+ void Prev() override { iter_->Prev(); }
+ bool IsOutOfBound() override { return iter_->IsOutOfBound(); }
+
+ Slice key() const override {
+ assert(Valid());
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(iter_->key(), &parsed_key)) {
+ status_ = Status::Corruption("malformed internal key");
+ return Slice("corrupted key");
+ }
+ return parsed_key.user_key;
+ }
+
+ Slice value() const override { return iter_->value(); }
+ Status status() const override {
+ return status_.ok() ? iter_->status() : status_;
+ }
+
+ private:
+ mutable Status status_;
+ InternalIterator* iter_;
+ bool arena_mode_;
+
+ // No copying allowed
+ KeyConvertingIterator(const KeyConvertingIterator&);
+ void operator=(const KeyConvertingIterator&);
+};
+
+class TableConstructor: public Constructor {
+ public:
+ explicit TableConstructor(const Comparator* cmp,
+ bool convert_to_internal_key = false,
+ int level = -1, SequenceNumber largest_seqno = 0)
+ : Constructor(cmp),
+ largest_seqno_(largest_seqno),
+ convert_to_internal_key_(convert_to_internal_key),
+ level_(level) {
+ env_ = ROCKSDB_NAMESPACE::Env::Default();
+ }
+ ~TableConstructor() override { Reset(); }
+
+ Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const BlockBasedTableOptions& /*table_options*/,
+ const InternalKeyComparator& internal_comparator,
+ const stl_wrappers::KVMap& kv_map) override {
+ Reset();
+ soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+ file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(),
+ "" /* don't care */));
+ std::unique_ptr<TableBuilder> builder;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+
+ if (largest_seqno_ != 0) {
+ // Pretend that it's an external file written by SstFileWriter.
+ int_tbl_prop_collector_factories.emplace_back(
+ new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+ 0 /* global_seqno*/));
+ }
+
+ std::string column_family_name;
+ builder.reset(ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories,
+ options.compression, options.sample_for_compression,
+ options.compression_opts, false /* skip_filters */,
+ column_family_name, level_),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer_.get()));
+
+ for (const auto kv : kv_map) {
+ if (convert_to_internal_key_) {
+ ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
+ std::string encoded;
+ AppendInternalKey(&encoded, ikey);
+ builder->Add(encoded, kv.second);
+ } else {
+ builder->Add(kv.first, kv.second);
+ }
+ EXPECT_TRUE(builder->status().ok());
+ }
+ Status s = builder->Finish();
+ file_writer_->Flush();
+ EXPECT_TRUE(s.ok()) << s.ToString();
+
+ EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize());
+
+ // Open the table
+ uniq_id_ = cur_uniq_id_++;
+ file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
+ TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
+ const bool kSkipFilters = true;
+ const bool kImmortal = true;
+ return ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
+ internal_comparator, !kSkipFilters, !kImmortal,
+ level_, largest_seqno_, &block_cache_tracer_),
+ std::move(file_reader_), TEST_GetSink()->contents().size(),
+ &table_reader_);
+ }
+
+ InternalIterator* NewIterator(
+ const SliceTransform* prefix_extractor) const override {
+ ReadOptions ro;
+ InternalIterator* iter = table_reader_->NewIterator(
+ ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false,
+ TableReaderCaller::kUncategorized);
+ if (convert_to_internal_key_) {
+ return new KeyConvertingIterator(iter);
+ } else {
+ return iter;
+ }
+ }
+
+ uint64_t ApproximateOffsetOf(const Slice& key) const {
+ if (convert_to_internal_key_) {
+ InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+ const Slice skey = ikey.Encode();
+ return table_reader_->ApproximateOffsetOf(
+ skey, TableReaderCaller::kUncategorized);
+ }
+ return table_reader_->ApproximateOffsetOf(
+ key, TableReaderCaller::kUncategorized);
+ }
+
+ virtual Status Reopen(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& moptions) {
+ file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
+ TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
+ return ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
+ *last_internal_key_),
+ std::move(file_reader_), TEST_GetSink()->contents().size(),
+ &table_reader_);
+ }
+
+ virtual TableReader* GetTableReader() { return table_reader_.get(); }
+
+ bool AnywayDeleteIterator() const override {
+ return convert_to_internal_key_;
+ }
+
+ void ResetTableReader() { table_reader_.reset(); }
+
+ bool ConvertToInternalKey() { return convert_to_internal_key_; }
+
+ test::StringSink* TEST_GetSink() {
+ return ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(
+ file_writer_.get());
+ }
+
+ BlockCacheTracer block_cache_tracer_;
+
+ private:
+ void Reset() {
+ uniq_id_ = 0;
+ table_reader_.reset();
+ file_writer_.reset();
+ file_reader_.reset();
+ }
+
+ uint64_t uniq_id_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ std::unique_ptr<TableReader> table_reader_;
+ SequenceNumber largest_seqno_;
+ bool convert_to_internal_key_;
+ int level_;
+
+ TableConstructor();
+
+ static uint64_t cur_uniq_id_;
+ EnvOptions soptions;
+ Env* env_;
+};
+uint64_t TableConstructor::cur_uniq_id_ = 1;
+
+class MemTableConstructor: public Constructor {
+ public:
+ explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb)
+ : Constructor(cmp),
+ internal_comparator_(cmp),
+ write_buffer_manager_(wb),
+ table_factory_(new SkipListFactory) {
+ options_.memtable_factory = table_factory_;
+ ImmutableCFOptions ioptions(options_);
+ memtable_ =
+ new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_),
+ wb, kMaxSequenceNumber, 0 /* column_family_id */);
+ memtable_->Ref();
+ }
+ ~MemTableConstructor() override { delete memtable_->Unref(); }
+ Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& /*moptions*/,
+ const BlockBasedTableOptions& /*table_options*/,
+ const InternalKeyComparator& /*internal_comparator*/,
+ const stl_wrappers::KVMap& kv_map) override {
+ delete memtable_->Unref();
+ ImmutableCFOptions mem_ioptions(ioptions);
+ memtable_ = new MemTable(internal_comparator_, mem_ioptions,
+ MutableCFOptions(options_), write_buffer_manager_,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ memtable_->Ref();
+ int seq = 1;
+ for (const auto kv : kv_map) {
+ memtable_->Add(seq, kTypeValue, kv.first, kv.second);
+ seq++;
+ }
+ return Status::OK();
+ }
+ InternalIterator* NewIterator(
+ const SliceTransform* /*prefix_extractor*/) const override {
+ return new KeyConvertingIterator(
+ memtable_->NewIterator(ReadOptions(), &arena_), true);
+ }
+
+ bool AnywayDeleteIterator() const override { return true; }
+
+ bool IsArenaMode() const override { return true; }
+
+ private:
+ mutable Arena arena_;
+ InternalKeyComparator internal_comparator_;
+ Options options_;
+ WriteBufferManager* write_buffer_manager_;
+ MemTable* memtable_;
+ std::shared_ptr<SkipListFactory> table_factory_;
+};
+
+class InternalIteratorFromIterator : public InternalIterator {
+ public:
+ explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {}
+ bool Valid() const override { return it_->Valid(); }
+ void Seek(const Slice& target) override { it_->Seek(target); }
+ void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); }
+ void SeekToFirst() override { it_->SeekToFirst(); }
+ void SeekToLast() override { it_->SeekToLast(); }
+ void Next() override { it_->Next(); }
+ void Prev() override { it_->Prev(); }
+ Slice key() const override { return it_->key(); }
+ Slice value() const override { return it_->value(); }
+ Status status() const override { return it_->status(); }
+
+ private:
+ std::unique_ptr<Iterator> it_;
+};
+
+class DBConstructor: public Constructor {
+ public:
+ explicit DBConstructor(const Comparator* cmp)
+ : Constructor(cmp),
+ comparator_(cmp) {
+ db_ = nullptr;
+ NewDB();
+ }
+ ~DBConstructor() override { delete db_; }
+ Status FinishImpl(const Options& /*options*/,
+ const ImmutableCFOptions& /*ioptions*/,
+ const MutableCFOptions& /*moptions*/,
+ const BlockBasedTableOptions& /*table_options*/,
+ const InternalKeyComparator& /*internal_comparator*/,
+ const stl_wrappers::KVMap& kv_map) override {
+ delete db_;
+ db_ = nullptr;
+ NewDB();
+ for (const auto kv : kv_map) {
+ WriteBatch batch;
+ batch.Put(kv.first, kv.second);
+ EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+ }
+ return Status::OK();
+ }
+
+ InternalIterator* NewIterator(
+ const SliceTransform* /*prefix_extractor*/) const override {
+ return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
+ }
+
+ DB* db() const override { return db_; }
+
+ private:
+ void NewDB() {
+ std::string name = test::PerThreadDBPath("table_testdb");
+
+ Options options;
+ options.comparator = comparator_;
+ Status status = DestroyDB(name, options);
+ ASSERT_TRUE(status.ok()) << status.ToString();
+
+ options.create_if_missing = true;
+ options.error_if_exists = true;
+ options.write_buffer_size = 10000; // Something small to force merging
+ status = DB::Open(options, name, &db_);
+ ASSERT_TRUE(status.ok()) << status.ToString();
+ }
+
+ const Comparator* comparator_;
+ DB* db_;
+};
+
+enum TestType {
+ BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
+ PLAIN_TABLE_SEMI_FIXED_PREFIX,
+ PLAIN_TABLE_FULL_STR_PREFIX,
+ PLAIN_TABLE_TOTAL_ORDER,
+#endif // !ROCKSDB_LITE
+ BLOCK_TEST,
+ MEMTABLE_TEST,
+ DB_TEST
+};
+
+struct TestArgs {
+ TestType type;
+ bool reverse_compare;
+ int restart_interval;
+ CompressionType compression;
+ uint32_t format_version;
+ bool use_mmap;
+};
+
+static std::vector<TestArgs> GenerateArgList() {
+ std::vector<TestArgs> test_args;
+ std::vector<TestType> test_types = {
+ BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
+ PLAIN_TABLE_SEMI_FIXED_PREFIX,
+ PLAIN_TABLE_FULL_STR_PREFIX,
+ PLAIN_TABLE_TOTAL_ORDER,
+#endif // !ROCKSDB_LITE
+ BLOCK_TEST,
+ MEMTABLE_TEST, DB_TEST};
+ std::vector<bool> reverse_compare_types = {false, true};
+ std::vector<int> restart_intervals = {16, 1, 1024};
+
+ // Only add compression if it is supported
+ std::vector<std::pair<CompressionType, bool>> compression_types;
+ compression_types.emplace_back(kNoCompression, false);
+ if (Snappy_Supported()) {
+ compression_types.emplace_back(kSnappyCompression, false);
+ }
+ if (Zlib_Supported()) {
+ compression_types.emplace_back(kZlibCompression, false);
+ compression_types.emplace_back(kZlibCompression, true);
+ }
+ if (BZip2_Supported()) {
+ compression_types.emplace_back(kBZip2Compression, false);
+ compression_types.emplace_back(kBZip2Compression, true);
+ }
+ if (LZ4_Supported()) {
+ compression_types.emplace_back(kLZ4Compression, false);
+ compression_types.emplace_back(kLZ4Compression, true);
+ compression_types.emplace_back(kLZ4HCCompression, false);
+ compression_types.emplace_back(kLZ4HCCompression, true);
+ }
+ if (XPRESS_Supported()) {
+ compression_types.emplace_back(kXpressCompression, false);
+ compression_types.emplace_back(kXpressCompression, true);
+ }
+ if (ZSTD_Supported()) {
+ compression_types.emplace_back(kZSTD, false);
+ compression_types.emplace_back(kZSTD, true);
+ }
+
+ for (auto test_type : test_types) {
+ for (auto reverse_compare : reverse_compare_types) {
+#ifndef ROCKSDB_LITE
+ if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
+ test_type == PLAIN_TABLE_FULL_STR_PREFIX ||
+ test_type == PLAIN_TABLE_TOTAL_ORDER) {
+ // Plain table doesn't use restart index or compression.
+ TestArgs one_arg;
+ one_arg.type = test_type;
+ one_arg.reverse_compare = reverse_compare;
+ one_arg.restart_interval = restart_intervals[0];
+ one_arg.compression = compression_types[0].first;
+ one_arg.use_mmap = true;
+ test_args.push_back(one_arg);
+ one_arg.use_mmap = false;
+ test_args.push_back(one_arg);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+ for (auto restart_interval : restart_intervals) {
+ for (auto compression_type : compression_types) {
+ TestArgs one_arg;
+ one_arg.type = test_type;
+ one_arg.reverse_compare = reverse_compare;
+ one_arg.restart_interval = restart_interval;
+ one_arg.compression = compression_type.first;
+ one_arg.format_version = compression_type.second ? 2 : 1;
+ one_arg.use_mmap = false;
+ test_args.push_back(one_arg);
+ }
+ }
+ }
+ }
+ return test_args;
+}
+
+// In order to make all tests run for the plain table format, including
+// those operating on empty keys, create a new prefix transformer which
+// returns a fixed-length prefix if the slice is not shorter than the prefix
+// length, and the full slice if it is shorter.
+class FixedOrLessPrefixTransform : public SliceTransform {
+ private:
+ const size_t prefix_len_;
+
+ public:
+ explicit FixedOrLessPrefixTransform(size_t prefix_len) :
+ prefix_len_(prefix_len) {
+ }
+
+ const char* Name() const override { return "rocksdb.FixedPrefix"; }
+
+ Slice Transform(const Slice& src) const override {
+ assert(InDomain(src));
+ if (src.size() < prefix_len_) {
+ return src;
+ }
+ return Slice(src.data(), prefix_len_);
+ }
+
+ bool InDomain(const Slice& /*src*/) const override { return true; }
+
+ bool InRange(const Slice& dst) const override {
+ return (dst.size() <= prefix_len_);
+ }
+ bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
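+
+// For example, with prefix_len = 2 (illustrative values only):
+//   Transform("abcd") == "ab"  // not shorter than the prefix length
+//   Transform("a")    == "a"   // shorter than the prefix length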
+
+class HarnessTest : public testing::Test {
+ public:
+ HarnessTest()
+ : ioptions_(options_),
+ moptions_(options_),
+ constructor_(nullptr),
+ write_buffer_(options_.db_write_buffer_size) {}
+
+ void Init(const TestArgs& args) {
+ delete constructor_;
+ constructor_ = nullptr;
+ options_ = Options();
+ options_.compression = args.compression;
+ // Use shorter block size for tests to exercise block boundary
+ // conditions more.
+ if (args.reverse_compare) {
+ options_.comparator = &reverse_key_comparator;
+ }
+
+ internal_comparator_.reset(
+ new test::PlainInternalKeyComparator(options_.comparator));
+
+ support_prev_ = true;
+ only_support_prefix_seek_ = false;
+ options_.allow_mmap_reads = args.use_mmap;
+ switch (args.type) {
+ case BLOCK_BASED_TABLE_TEST:
+ table_options_.flush_block_policy_factory.reset(
+ new FlushBlockBySizePolicyFactory());
+ table_options_.block_size = 256;
+ table_options_.block_restart_interval = args.restart_interval;
+ table_options_.index_block_restart_interval = args.restart_interval;
+ table_options_.format_version = args.format_version;
+ options_.table_factory.reset(
+ new BlockBasedTableFactory(table_options_));
+ constructor_ = new TableConstructor(
+ options_.comparator, true /* convert_to_internal_key_ */);
+ internal_comparator_.reset(
+ new InternalKeyComparator(options_.comparator));
+ break;
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ case PLAIN_TABLE_SEMI_FIXED_PREFIX:
+ support_prev_ = false;
+ only_support_prefix_seek_ = true;
+ options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
+ options_.table_factory.reset(NewPlainTableFactory());
+ constructor_ = new TableConstructor(
+ options_.comparator, true /* convert_to_internal_key_ */);
+ internal_comparator_.reset(
+ new InternalKeyComparator(options_.comparator));
+ break;
+ case PLAIN_TABLE_FULL_STR_PREFIX:
+ support_prev_ = false;
+ only_support_prefix_seek_ = true;
+ options_.prefix_extractor.reset(NewNoopTransform());
+ options_.table_factory.reset(NewPlainTableFactory());
+ constructor_ = new TableConstructor(
+ options_.comparator, true /* convert_to_internal_key_ */);
+ internal_comparator_.reset(
+ new InternalKeyComparator(options_.comparator));
+ break;
+ case PLAIN_TABLE_TOTAL_ORDER:
+ support_prev_ = false;
+ only_support_prefix_seek_ = false;
+ options_.prefix_extractor = nullptr;
+
+ {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+
+ options_.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ }
+ constructor_ = new TableConstructor(
+ options_.comparator, true /* convert_to_internal_key_ */);
+ internal_comparator_.reset(
+ new InternalKeyComparator(options_.comparator));
+ break;
+#endif // !ROCKSDB_LITE
+ case BLOCK_TEST:
+ table_options_.block_size = 256;
+ options_.table_factory.reset(
+ new BlockBasedTableFactory(table_options_));
+ constructor_ = new BlockConstructor(options_.comparator);
+ break;
+ case MEMTABLE_TEST:
+ table_options_.block_size = 256;
+ options_.table_factory.reset(
+ new BlockBasedTableFactory(table_options_));
+ constructor_ = new MemTableConstructor(options_.comparator,
+ &write_buffer_);
+ break;
+ case DB_TEST:
+ table_options_.block_size = 256;
+ options_.table_factory.reset(
+ new BlockBasedTableFactory(table_options_));
+ constructor_ = new DBConstructor(options_.comparator);
+ break;
+ }
+ ioptions_ = ImmutableCFOptions(options_);
+ moptions_ = MutableCFOptions(options_);
+ }
+
+ ~HarnessTest() override { delete constructor_; }
+
+ void Add(const std::string& key, const std::string& value) {
+ constructor_->Add(key, value);
+ }
+
+ void Test(Random* rnd) {
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap data;
+ constructor_->Finish(options_, ioptions_, moptions_, table_options_,
+ *internal_comparator_, &keys, &data);
+
+ TestForwardScan(keys, data);
+ if (support_prev_) {
+ TestBackwardScan(keys, data);
+ }
+ TestRandomAccess(rnd, keys, data);
+ }
+
+ void TestForwardScan(const std::vector<std::string>& /*keys*/,
+ const stl_wrappers::KVMap& data) {
+ InternalIterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());
+ iter->SeekToFirst();
+ for (stl_wrappers::KVMap::const_iterator model_iter = data.begin();
+ model_iter != data.end(); ++model_iter) {
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ iter->Next();
+ }
+ ASSERT_TRUE(!iter->Valid());
+ if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+
+ void TestBackwardScan(const std::vector<std::string>& /*keys*/,
+ const stl_wrappers::KVMap& data) {
+ InternalIterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());
+ iter->SeekToLast();
+ for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin();
+ model_iter != data.rend(); ++model_iter) {
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ iter->Prev();
+ }
+ ASSERT_TRUE(!iter->Valid());
+ if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+
+ void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
+ const stl_wrappers::KVMap& data) {
+ static const bool kVerbose = false;
+ InternalIterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());
+ stl_wrappers::KVMap::const_iterator model_iter = data.begin();
+ if (kVerbose) fprintf(stderr, "---\n");
+ for (int i = 0; i < 200; i++) {
+ const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
+ switch (toss) {
+ case 0: {
+ if (iter->Valid()) {
+ if (kVerbose) fprintf(stderr, "Next\n");
+ iter->Next();
+ ++model_iter;
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ }
+ break;
+ }
+
+ case 1: {
+ if (kVerbose) fprintf(stderr, "SeekToFirst\n");
+ iter->SeekToFirst();
+ model_iter = data.begin();
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+
+ case 2: {
+ std::string key = PickRandomKey(rnd, keys);
+ model_iter = data.lower_bound(key);
+ if (kVerbose) fprintf(stderr, "Seek '%s'\n",
+ EscapeString(key).c_str());
+ iter->Seek(Slice(key));
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+
+ case 3: {
+ if (iter->Valid()) {
+ if (kVerbose) fprintf(stderr, "Prev\n");
+ iter->Prev();
+ if (model_iter == data.begin()) {
+ model_iter = data.end(); // Wrap around to invalid value
+ } else {
+ --model_iter;
+ }
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ }
+ break;
+ }
+
+ case 4: {
+ if (kVerbose) fprintf(stderr, "SeekToLast\n");
+ iter->SeekToLast();
+ if (keys.empty()) {
+ model_iter = data.end();
+ } else {
+ std::string last = data.rbegin()->first;
+ model_iter = data.lower_bound(last);
+ }
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+ }
+ }
+ if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+
+ std::string ToString(const stl_wrappers::KVMap& data,
+ const stl_wrappers::KVMap::const_iterator& it) {
+ if (it == data.end()) {
+ return "END";
+ } else {
+ return "'" + it->first + "->" + it->second + "'";
+ }
+ }
+
+ std::string ToString(const stl_wrappers::KVMap& data,
+ const stl_wrappers::KVMap::const_reverse_iterator& it) {
+ if (it == data.rend()) {
+ return "END";
+ } else {
+ return "'" + it->first + "->" + it->second + "'";
+ }
+ }
+
+ std::string ToString(const InternalIterator* it) {
+ if (!it->Valid()) {
+ return "END";
+ } else {
+ return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
+ }
+ }
+
+ std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
+ if (keys.empty()) {
+ return "foo";
+ } else {
+ const int index = rnd->Uniform(static_cast<int>(keys.size()));
+ std::string result = keys[index];
+ switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
+ case 0:
+ // Return an existing key
+ break;
+ case 1: {
+ // Attempt to return something smaller than an existing key
+ if (result.size() > 0 && result[result.size() - 1] > '\0'
+ && (!only_support_prefix_seek_
+ || options_.prefix_extractor->Transform(result).size()
+ < result.size())) {
+ result[result.size() - 1]--;
+ }
+ break;
+ }
+ case 2: {
+ // Return something larger than an existing key
+ Increment(options_.comparator, &result);
+ break;
+ }
+ }
+ return result;
+ }
+ }
+
+ // Returns nullptr if not running against a DB
+ DB* db() const { return constructor_->db(); }
+
+ void RandomizedHarnessTest(size_t part, size_t total) {
+ std::vector<TestArgs> args = GenerateArgList();
+ assert(part);
+ assert(part <= total);
+ for (size_t i = 0; i < args.size(); i++) {
+ if ((i % total) + 1 != part) {
+ continue;
+ }
+ Init(args[i]);
+ Random rnd(test::RandomSeed() + 5);
+ for (int num_entries = 0; num_entries < 2000;
+ num_entries += (num_entries < 50 ? 1 : 200)) {
+ for (int e = 0; e < num_entries; e++) {
+ std::string v;
+ Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+ test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+ }
+ Test(&rnd);
+ }
+ }
+ }
+
+ private:
+ Options options_ = Options();
+ ImmutableCFOptions ioptions_;
+ MutableCFOptions moptions_;
+ BlockBasedTableOptions table_options_ = BlockBasedTableOptions();
+ Constructor* constructor_;
+ WriteBufferManager write_buffer_;
+ bool support_prev_;
+ bool only_support_prefix_seek_;
+ std::shared_ptr<InternalKeyComparator> internal_comparator_;
+};
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val),
+ (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
+
+// Tests against all kinds of tables
+class TableTest : public testing::Test {
+ public:
+ const InternalKeyComparator& GetPlainInternalComparator(
+ const Comparator* comp) {
+ if (!plain_internal_comparator) {
+ plain_internal_comparator.reset(
+ new test::PlainInternalKeyComparator(comp));
+ }
+ return *plain_internal_comparator;
+ }
+ void IndexTest(BlockBasedTableOptions table_options);
+
+ private:
+ std::unique_ptr<InternalKeyComparator> plain_internal_comparator;
+};
+
+class GeneralTableTest : public TableTest {};
+class BlockBasedTableTest
+ : public TableTest,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ BlockBasedTableTest() : format_(GetParam()) {
+ env_ = ROCKSDB_NAMESPACE::Env::Default();
+ }
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions options;
+ options.format_version = format_;
+ return options;
+ }
+
+ void SetupTracingTest(TableConstructor* c) {
+ test_path_ = test::PerThreadDBPath("block_based_table_tracing_test");
+ EXPECT_OK(env_->CreateDir(test_path_));
+ trace_file_path_ = test_path_ + "/block_cache_trace_file";
+ TraceOptions trace_opt;
+ std::unique_ptr<TraceWriter> trace_writer;
+ EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_,
+ &trace_writer));
+ c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer));
+ {
+ std::string user_key = "k01";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ c->Add(encoded_key, kDummyValue);
+ }
+ {
+ std::string user_key = "k02";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ c->Add(encoded_key, kDummyValue);
+ }
+ }
+
+ void VerifyBlockAccessTrace(
+ TableConstructor* c,
+ const std::vector<BlockCacheTraceRecord>& expected_records) {
+ c->block_cache_tracer_.EndTrace();
+
+ std::unique_ptr<TraceReader> trace_reader;
+ Status s =
+ NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader);
+ EXPECT_OK(s);
+ BlockCacheTraceReader reader(std::move(trace_reader));
+ BlockCacheTraceHeader header;
+ EXPECT_OK(reader.ReadHeader(&header));
+ uint32_t index = 0;
+ while (s.ok()) {
+ BlockCacheTraceRecord access;
+ s = reader.ReadAccess(&access);
+ if (!s.ok()) {
+ break;
+ }
+ ASSERT_LT(index, expected_records.size());
+ EXPECT_NE("", access.block_key);
+ EXPECT_EQ(access.block_type, expected_records[index].block_type);
+ EXPECT_GT(access.block_size, 0);
+ EXPECT_EQ(access.caller, expected_records[index].caller);
+ EXPECT_EQ(access.no_insert, expected_records[index].no_insert);
+ EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit);
+ // Get
+ if (access.caller == TableReaderCaller::kUserGet) {
+ EXPECT_EQ(access.referenced_key,
+ expected_records[index].referenced_key);
+ EXPECT_EQ(access.get_id, expected_records[index].get_id);
+ EXPECT_EQ(access.get_from_user_specified_snapshot,
+ expected_records[index].get_from_user_specified_snapshot);
+ if (access.block_type == TraceType::kBlockTraceDataBlock) {
+ EXPECT_GT(access.referenced_data_size, 0);
+ EXPECT_GT(access.num_keys_in_block, 0);
+ EXPECT_EQ(access.referenced_key_exist_in_block,
+ expected_records[index].referenced_key_exist_in_block);
+ }
+ } else {
+ EXPECT_EQ(access.referenced_key, "");
+ EXPECT_EQ(access.get_id, 0);
+ EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse);
+ EXPECT_EQ(access.referenced_data_size, 0);
+ EXPECT_EQ(access.num_keys_in_block, 0);
+ EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse);
+ }
+ index++;
+ }
+ EXPECT_EQ(index, expected_records.size());
+ EXPECT_OK(env_->DeleteFile(trace_file_path_));
+ EXPECT_OK(env_->DeleteDir(test_path_));
+ }
+
+ protected:
+ uint64_t IndexUncompressedHelper(bool indexCompress);
+
+ private:
+ uint32_t format_;
+ Env* env_;
+ std::string trace_file_path_;
+ std::string test_path_;
+};
+class PlainTableTest : public TableTest {};
+class TablePropertyTest : public testing::Test {};
+class BBTTailPrefetchTest : public TableTest {};
+
+// A helper class to test the file checksum.
+class FileChecksumTestHelper {
+ public:
+ FileChecksumTestHelper(bool convert_to_internal_key = false)
+ : convert_to_internal_key_(convert_to_internal_key) {
+ sink_ = new test::StringSink();
+ }
+ ~FileChecksumTestHelper() {}
+
+ void CreateWriteableFile() {
+ file_writer_.reset(test::GetWritableFileWriter(sink_, "" /* don't care */));
+ }
+
+ void SetFileChecksumFunc(FileChecksumFunc* checksum_func) {
+ if (file_writer_ != nullptr) {
+ file_writer_->TEST_SetFileChecksumFunc(checksum_func);
+ }
+ }
+
+ WritableFileWriter* GetFileWriter() { return file_writer_.get(); }
+
+ Status ResetTableBuilder(std::unique_ptr<TableBuilder>&& builder) {
+ assert(builder != nullptr);
+ table_builder_ = std::move(builder);
+ return Status::OK();
+ }
+
+ void AddKVtoKVMap(int num_entries) {
+ Random rnd(test::RandomSeed());
+ for (int i = 0; i < num_entries; i++) {
+ std::string v;
+ test::RandomString(&rnd, 100, &v);
+ kv_map_[test::RandomKey(&rnd, 20)] = v;
+ }
+ }
+
+ Status WriteKVAndFlushTable() {
+ for (const auto kv : kv_map_) {
+ if (convert_to_internal_key_) {
+ ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
+ std::string encoded;
+ AppendInternalKey(&encoded, ikey);
+ table_builder_->Add(encoded, kv.second);
+ } else {
+ table_builder_->Add(kv.first, kv.second);
+ }
+ EXPECT_TRUE(table_builder_->status().ok());
+ }
+ Status s = table_builder_->Finish();
+ file_writer_->Flush();
+ EXPECT_TRUE(s.ok());
+
+ EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize());
+ return s;
+ }
+
+ std::string GetFileChecksum() { return table_builder_->GetFileChecksum(); }
+
+ const char* GetFileChecksumFuncName() {
+ return table_builder_->GetFileChecksumFuncName();
+ }
+
+ Status CalculateFileChecksum(FileChecksumFunc* file_checksum_func,
+ std::string* checksum) {
+ assert(file_checksum_func != nullptr);
+ cur_uniq_id_ = checksum_uniq_id_++;
+ test::StringSink* ss_rw =
+ ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(
+ file_writer_.get());
+ file_reader_.reset(test::GetRandomAccessFileReader(
+ new test::StringSource(ss_rw->contents())));
+ std::unique_ptr<char[]> scratch(new char[2048]);
+ Slice result;
+ uint64_t offset = 0;
+ std::string tmp_checksum;
+ bool first_read = true;
+ Status s;
+ s = file_reader_->Read(offset, 2048, &result, scratch.get(), false);
+ if (!s.ok()) {
+ return s;
+ }
+ while (result.size() != 0) {
+ if (first_read) {
+ first_read = false;
+ tmp_checksum = file_checksum_func->Value(scratch.get(), result.size());
+ } else {
+ tmp_checksum = file_checksum_func->Extend(tmp_checksum, scratch.get(),
+ result.size());
+ }
+ offset += static_cast<uint64_t>(result.size());
+ s = file_reader_->Read(offset, 2048, &result, scratch.get(), false);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ EXPECT_EQ(offset, static_cast<uint64_t>(table_builder_->FileSize()));
+ *checksum = tmp_checksum;
+ return Status::OK();
+ }
+
+ private:
+ bool convert_to_internal_key_;
+ uint64_t cur_uniq_id_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ std::unique_ptr<TableBuilder> table_builder_;
+ stl_wrappers::KVMap kv_map_;
+ test::StringSink* sink_;
+
+ static uint64_t checksum_uniq_id_;
+};
+
+uint64_t FileChecksumTestHelper::checksum_uniq_id_ = 1;
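+
+// Typical flow with this helper (a sketch; `builder` and `checksum_func` are
+// assumed to be created as in the individual tests below):
+//
+//   FileChecksumTestHelper helper(true /* convert_to_internal_key */);
+//   helper.CreateWriteableFile();
+//   helper.SetFileChecksumFunc(checksum_func);
+//   helper.ResetTableBuilder(std::move(builder));
+//   helper.AddKVtoKVMap(1000);
+//   ASSERT_OK(helper.WriteKVAndFlushTable());
+//   std::string checksum = helper.GetFileChecksum();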
+
+INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest,
+ testing::Values(test::kLatestFormatVersion));
+
+// This test serves as a living tutorial for prefix scans over user-collected
+// properties.
+TEST_F(TablePropertyTest, PrefixScanTest) {
+ UserCollectedProperties props{{"num.111.1", "1"},
+ {"num.111.2", "2"},
+ {"num.111.3", "3"},
+ {"num.333.1", "1"},
+ {"num.333.2", "2"},
+ {"num.333.3", "3"},
+ {"num.555.1", "1"},
+ {"num.555.2", "2"},
+ {"num.555.3", "3"}, };
+
+ // prefixes that exist
+ for (const std::string& prefix : {"num.111", "num.333", "num.555"}) {
+ int num = 0;
+ for (auto pos = props.lower_bound(prefix);
+ pos != props.end() &&
+ pos->first.compare(0, prefix.size(), prefix) == 0;
+ ++pos) {
+ ++num;
+ auto key = prefix + "." + ToString(num);
+ ASSERT_EQ(key, pos->first);
+ ASSERT_EQ(ToString(num), pos->second);
+ }
+ ASSERT_EQ(3, num);
+ }
+
+ // prefixes that don't exist
+ for (const std::string& prefix :
+ {"num.000", "num.222", "num.444", "num.666"}) {
+ auto pos = props.lower_bound(prefix);
+ ASSERT_TRUE(pos == props.end() ||
+ pos->first.compare(0, prefix.size(), prefix) != 0);
+ }
+}
+
+// This test includes all the basic checks except those for index size and
+// block size, which are covered in separate unit tests.
+TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) {
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+
+ c.Add("a1", "val1");
+ c.Add("b2", "val2");
+ c.Add("c3", "val3");
+ c.Add("d4", "val4");
+ c.Add("e5", "val5");
+ c.Add("f6", "val6");
+ c.Add("g7", "val7");
+ c.Add("h8", "val8");
+ c.Add("j9", "val9");
+  // 9 k-v pairs; each internal key adds an 8-byte (seq + type) suffix.
+  uint64_t diff_internal_user_bytes = 9 * 8;
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ options.compression = kNoCompression;
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_restart_interval = 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ ioptions.statistics = options.statistics.get();
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0);
+
+ auto& props = *c.GetTableReader()->GetTableProperties();
+ ASSERT_EQ(kvmap.size(), props.num_entries);
+
+ auto raw_key_size = kvmap.size() * 2ul;
+ auto raw_value_size = kvmap.size() * 4ul;
+
+ ASSERT_EQ(raw_key_size + diff_internal_user_bytes, props.raw_key_size);
+ ASSERT_EQ(raw_value_size, props.raw_value_size);
+ ASSERT_EQ(1ul, props.num_data_blocks);
+ ASSERT_EQ("", props.filter_policy_name); // no filter policy is used
+
+ // Verify data size.
+ BlockBuilder block_builder(1);
+ for (const auto& item : kvmap) {
+ block_builder.Add(item.first, item.second);
+ }
+ Slice content = block_builder.Finish();
+ ASSERT_EQ(content.size() + kBlockTrailerSize + diff_internal_user_bytes,
+ props.data_size);
+ c.ResetTableReader();
+}
+
+#ifdef SNAPPY
+uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) {
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ constexpr size_t kNumKeys = 10000;
+
+ for (size_t k = 0; k < kNumKeys; ++k) {
+ c.Add("key" + ToString(k), "val" + ToString(k));
+ }
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ options.compression = kSnappyCompression;
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_restart_interval = 1;
+ table_options.enable_index_compression = compressed;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ ioptions.statistics = options.statistics.get();
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ c.ResetTableReader();
+ return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+}
+TEST_P(BlockBasedTableTest, IndexUncompressed) {
+ uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true);
+ uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false);
+ // tbl1_compressed_cnt should include 1 index block
+ EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt);
+}
+#endif // SNAPPY
+
+TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
+ TableConstructor c(&reverse_key_comparator);
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+
+ {
+ Options options;
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+ auto& props = *c.GetTableReader()->GetTableProperties();
+
+ // Default comparator
+ ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name);
+ // No merge operator
+ ASSERT_EQ("nullptr", props.merge_operator_name);
+ // No prefix extractor
+ ASSERT_EQ("nullptr", props.prefix_extractor_name);
+ // No property collectors
+ ASSERT_EQ("[]", props.property_collectors_names);
+ // No filter policy is used
+ ASSERT_EQ("", props.filter_policy_name);
+    // Compression type matches what was set:
+ ASSERT_EQ("NoCompression", props.compression_name);
+ c.ResetTableReader();
+ }
+
+ {
+ Options options;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.comparator = &reverse_key_comparator;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.prefix_extractor.reset(NewNoopTransform());
+ options.table_properties_collector_factories.emplace_back(
+ new DummyPropertiesCollectorFactory1());
+ options.table_properties_collector_factories.emplace_back(
+ new DummyPropertiesCollectorFactory2());
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+ auto& props = *c.GetTableReader()->GetTableProperties();
+
+ ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name);
+ ASSERT_EQ("UInt64AddOperator", props.merge_operator_name);
+ ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name);
+ ASSERT_EQ("[DummyPropertiesCollector1,DummyPropertiesCollector2]",
+ props.property_collectors_names);
+ ASSERT_EQ("", props.filter_policy_name); // no filter policy is used
+ c.ResetTableReader();
+ }
+}
+
+TEST_P(BlockBasedTableTest, RangeDelBlock) {
+ TableConstructor c(BytewiseComparator());
+ std::vector<std::string> keys = {"1pika", "2chu"};
+ std::vector<std::string> vals = {"p", "c"};
+
+ std::vector<RangeTombstone> expected_tombstones = {
+ {"1pika", "2chu", 0},
+ {"2chu", "c", 1},
+ {"2chu", "c", 0},
+ {"c", "p", 0},
+ };
+
+ for (int i = 0; i < 2; i++) {
+ RangeTombstone t(keys[i], vals[i], i);
+ std::pair<InternalKey, Slice> p = t.Serialize();
+ c.Add(p.first.Encode().ToString(), p.second);
+ }
+
+ std::vector<std::string> sorted_keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ options.compression = kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_restart_interval = 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ std::unique_ptr<InternalKeyComparator> internal_cmp(
+ new InternalKeyComparator(options.comparator));
+ c.Finish(options, ioptions, moptions, table_options, *internal_cmp,
+ &sorted_keys, &kvmap);
+
+ for (int j = 0; j < 2; ++j) {
+ std::unique_ptr<InternalIterator> iter(
+ c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
+ if (j > 0) {
+      // For the second iteration, delete the table reader object and verify
+      // that the iterator can still access its metablock's range tombstones.
+ c.ResetTableReader();
+ }
+ ASSERT_FALSE(iter->Valid());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ for (size_t i = 0; i < expected_tombstones.size(); i++) {
+ ASSERT_TRUE(iter->Valid());
+ ParsedInternalKey parsed_key;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key));
+ RangeTombstone t(parsed_key, iter->value());
+ const auto& expected_t = expected_tombstones[i];
+ ASSERT_EQ(t.start_key_, expected_t.start_key_);
+ ASSERT_EQ(t.end_key_, expected_t.end_key_);
+ ASSERT_EQ(t.seq_, expected_t.seq_);
+ iter->Next();
+ }
+ ASSERT_TRUE(!iter->Valid());
+ }
+}
+
+TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) {
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("a1", "val1");
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ Options options;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ auto& props = *c.GetTableReader()->GetTableProperties();
+ ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name);
+ c.ResetTableReader();
+}
+
+//
+// BlockBasedTableTest::PrefetchTest
+//
+void AssertKeysInCache(BlockBasedTable* table_reader,
+ const std::vector<std::string>& keys_in_cache,
+ const std::vector<std::string>& keys_not_in_cache,
+ bool convert = false) {
+ if (convert) {
+ for (auto key : keys_in_cache) {
+ InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+ ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+ }
+ for (auto key : keys_not_in_cache) {
+ InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+ ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+ }
+ } else {
+ for (auto key : keys_in_cache) {
+ ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
+ }
+ for (auto key : keys_not_in_cache) {
+ ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
+ }
+ }
+}
+
+void PrefetchRange(TableConstructor* c, Options* opt,
+ BlockBasedTableOptions* table_options, const char* key_begin,
+ const char* key_end,
+ const std::vector<std::string>& keys_in_cache,
+ const std::vector<std::string>& keys_not_in_cache,
+ const Status expected_status = Status::OK()) {
+ // reset the cache and reopen the table
+ table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+ opt->table_factory.reset(NewBlockBasedTableFactory(*table_options));
+ const ImmutableCFOptions ioptions2(*opt);
+ const MutableCFOptions moptions(*opt);
+ ASSERT_OK(c->Reopen(ioptions2, moptions));
+
+ // prefetch
+ auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader());
+ Status s;
+ std::unique_ptr<Slice> begin, end;
+ std::unique_ptr<InternalKey> i_begin, i_end;
+ if (key_begin != nullptr) {
+ if (c->ConvertToInternalKey()) {
+ i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue));
+ begin.reset(new Slice(i_begin->Encode()));
+ } else {
+ begin.reset(new Slice(key_begin));
+ }
+ }
+ if (key_end != nullptr) {
+ if (c->ConvertToInternalKey()) {
+ i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue));
+ end.reset(new Slice(i_end->Encode()));
+ } else {
+ end.reset(new Slice(key_end));
+ }
+ }
+ s = table_reader->Prefetch(begin.get(), end.get());
+
+ ASSERT_TRUE(s.code() == expected_status.code());
+
+ // assert our expectations about cache warmup
+ AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache,
+ c->ConvertToInternalKey());
+ c->ResetTableReader();
+}
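+
+// For reference, a minimal sketch (illustrative only) of the call that
+// PrefetchRange() wraps: Prefetch() walks the index for the given key
+// range and warms every overlapping data block into the block cache, so
+// later reads of those keys become cache hits. A nullptr bound means
+// "from the first block" / "through the last block", and begin > end
+// yields Status::InvalidArgument, as the cases in PrefetchTest below
+// demonstrate.
+//
+//   std::unique_ptr<Slice> begin(new Slice("k01")), end(new Slice("k05"));
+//   Status s = table_reader->Prefetch(begin.get(), end.get());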
+
+TEST_P(BlockBasedTableTest, PrefetchTest) {
+ // This test exercises the prefetching operation built into
+ // BlockBasedTable.
+ Options opt;
+ std::unique_ptr<InternalKeyComparator> ikc;
+ ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+ opt.compression = kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_size = 1024;
+ // big enough so we don't ever lose cached values.
+ table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("k01", "hello");
+ c.Add("k02", "hello2");
+ c.Add("k03", std::string(10000, 'x'));
+ c.Add("k04", std::string(200000, 'x'));
+ c.Add("k05", std::string(300000, 'x'));
+ c.Add("k06", "hello3");
+ c.Add("k07", std::string(100000, 'x'));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(opt);
+ const MutableCFOptions moptions(opt);
+ c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+ c.ResetTableReader();
+
+ // We get the following data spread :
+ //
+ // Data block Index
+ // ========================
+ // [ k01 k02 k03 ] k03
+ // [ k04 ] k04
+ // [ k05 ] k05
+ // [ k06 k07 ] k07
+
+
+ // Simple
+ PrefetchRange(&c, &opt, &table_options,
+ /*key_range=*/"k01", "k05",
+ /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
+ /*keys_not_in_cache=*/{"k06", "k07"});
+ PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
+ {"k04", "k05", "k06", "k07"});
+ // odd
+ PrefetchRange(&c, &opt, &table_options, "a", "z",
+ {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+ PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"},
+ {"k04", "k05", "k06", "k07"});
+ // Edge cases
+ PrefetchRange(&c, &opt, &table_options, "k00", "k06",
+ {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+ PrefetchRange(&c, &opt, &table_options, "k00", "zzz",
+ {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+ // null keys
+ PrefetchRange(&c, &opt, &table_options, nullptr, nullptr,
+ {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+ PrefetchRange(&c, &opt, &table_options, "k04", nullptr,
+ {"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"});
+ PrefetchRange(&c, &opt, &table_options, nullptr, "k05",
+ {"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"});
+ // invalid
+ PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {},
+ Status::InvalidArgument(Slice("k06 "), Slice("k07")));
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ for (int i = 0; i <= 5; ++i) {
+ Options options;
+ // Make each key/value an individual block
+ table_options.block_size = 64;
+ switch (i) {
+ case 0:
+ // Binary search index
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // Hash search index
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+ break;
+ case 2:
+ // Hash search index with hash_index_allow_collision
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.hash_index_allow_collision = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+ break;
+ case 3:
+ // Hash search index with filter policy
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+ break;
+ case 4:
+ // Two-level index
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ break;
+ case 5:
+ // Binary search with first key
+ table_options.index_type =
+ BlockBasedTableOptions::kBinarySearchWithFirstKey;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ break;
+ }
+
+ TableConstructor c(BytewiseComparator(),
+ true /* convert_to_internal_key_ */);
+ c.Add("aaaa1", std::string('a', 56));
+ c.Add("bbaa1", std::string('a', 56));
+ c.Add("cccc1", std::string('a', 56));
+ c.Add("bbbb1", std::string('a', 56));
+ c.Add("baaa1", std::string('a', 56));
+ c.Add("abbb1", std::string('a', 56));
+ c.Add("cccc2", std::string('a', 56));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ auto props = c.GetTableReader()->GetTableProperties();
+ ASSERT_EQ(7u, props->num_data_blocks);
+ auto* reader = c.GetTableReader();
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+ ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ iter->Seek(InternalKey("b", 0, kTypeValue).Encode());
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+
+ iter->Seek(InternalKey("bb", 0, kTypeValue).Encode());
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+
+ iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode());
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString());
+ }
+}
+
+TEST_P(BlockBasedTableTest, NoopTransformSeek) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+
+ Options options;
+ options.comparator = BytewiseComparator();
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewNoopTransform());
+
+ TableConstructor c(options.comparator);
+ // To tickle the PrefixMayMatch bug, it is important that the
+ // user-key is a single byte so that the index key exactly matches
+ // the user-key.
+ InternalKey key("a", 1, kTypeValue);
+ c.Add(key.Encode().ToString(), "b");
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ const InternalKeyComparator internal_comparator(options.comparator);
+ c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+ &keys, &kvmap);
+
+ auto* reader = c.GetTableReader();
+ for (int i = 0; i < 2; ++i) {
+ ReadOptions ro;
+ ro.total_order_seek = (i == 0);
+ std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+ ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ iter->Seek(key.Encode());
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString());
+ }
+}
+
+TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) {
+ // If the DB is opened with a prefix extractor of a different name, the
+ // prefix bloom filter is skipped when reading the file.
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(2));
+ table_options.whole_key_filtering = false;
+
+ Options options;
+ options.comparator = BytewiseComparator();
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ TableConstructor c(options.comparator);
+ InternalKey key("abcdefghijk", 1, kTypeValue);
+ c.Add(key.Encode().ToString(), "test");
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ const InternalKeyComparator internal_comparator(options.comparator);
+ c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+ &keys, &kvmap);
+ // TODO(Zhongyi): update test to use MutableCFOptions
+ options.prefix_extractor.reset(NewFixedPrefixTransform(9));
+ const ImmutableCFOptions new_ioptions(options);
+ const MutableCFOptions new_moptions(options);
+ c.Reopen(new_ioptions, new_moptions);
+ auto reader = c.GetTableReader();
+ std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
+ ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ // Test point lookup
+ // only one kv
+ for (auto& kv : kvmap) {
+ db_iter->Seek(kv.first);
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), kv.first);
+ ASSERT_EQ(db_iter->value(), kv.second);
+ }
+}
+
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+
+void AddInternalKey(TableConstructor* c, const std::string& prefix,
+ std::string value = "v", int suffix_len = 800) {
+ static Random rnd(1023);
+ InternalKey k(prefix + RandomString(&rnd, suffix_len), 0, kTypeValue);
+ c->Add(k.Encode().ToString(), value);
+}
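+
+// Sizing note for the tests below (derived from the constants above): each
+// entry is a 4-byte prefix plus an 800-byte random suffix plus the 8-byte
+// internal-key footer, i.e. a bit over 810 bytes with the value. With
+// block_size = 1700 in IndexTest, two entries fill a block, so the ten
+// keys added there produce the five data blocks the test asserts.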
+
+void TableTest::IndexTest(BlockBasedTableOptions table_options) {
+ TableConstructor c(BytewiseComparator());
+
+ // Keys with prefix length 3; make sure each key/value is big enough to
+ // fill one block.
+ AddInternalKey(&c, "0015");
+ AddInternalKey(&c, "0035");
+
+ AddInternalKey(&c, "0054");
+ AddInternalKey(&c, "0055");
+
+ AddInternalKey(&c, "0056");
+ AddInternalKey(&c, "0057");
+
+ AddInternalKey(&c, "0058");
+ AddInternalKey(&c, "0075");
+
+ AddInternalKey(&c, "0076");
+ AddInternalKey(&c, "0095");
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ table_options.block_size = 1700;
+ table_options.block_cache = NewLRUCache(1024, 4);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+ &kvmap);
+ auto reader = c.GetTableReader();
+
+ auto props = reader->GetTableProperties();
+ ASSERT_EQ(5u, props->num_data_blocks);
+
+ // TODO(Zhongyi): update test to use MutableCFOptions
+ std::unique_ptr<InternalIterator> index_iter(reader->NewIterator(
+ ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ // -- Find keys that do not exist but share a common prefix.
+ std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
+ std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2],
+ keys[7], keys[9]};
+
+ // find the lower bound of the prefix
+ for (size_t i = 0; i < prefixes.size(); ++i) {
+ index_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
+ ASSERT_OK(index_iter->status());
+ ASSERT_TRUE(index_iter->Valid());
+
+ // seek the first element in the block
+ ASSERT_EQ(lower_bound[i], index_iter->key().ToString());
+ ASSERT_EQ("v", index_iter->value().ToString());
+ }
+
+ // find the upper bound of prefixes
+ std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7], keys[9]};
+
+ // find existing keys
+ for (const auto& item : kvmap) {
+ auto ukey = ExtractUserKey(item.first).ToString();
+ index_iter->Seek(ukey);
+
+ // ASSERT_OK(regular_iter->status());
+ ASSERT_OK(index_iter->status());
+
+ // ASSERT_TRUE(regular_iter->Valid());
+ ASSERT_TRUE(index_iter->Valid());
+
+ ASSERT_EQ(item.first, index_iter->key().ToString());
+ ASSERT_EQ(item.second, index_iter->value().ToString());
+ }
+
+ for (size_t i = 0; i < prefixes.size(); ++i) {
+ // the key is greater than any existing keys.
+ auto key = prefixes[i] + "9";
+ index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
+
+ ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound());
+ ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid());
+ if (i == prefixes.size() - 1) {
+ // last key
+ ASSERT_TRUE(!index_iter->Valid());
+ } else {
+ ASSERT_TRUE(index_iter->Valid());
+ // seek the first element in the block
+ ASSERT_EQ(upper_bound[i], index_iter->key().ToString());
+ ASSERT_EQ("v", index_iter->value().ToString());
+ }
+ }
+
+ // find keys with prefix that don't match any of the existing prefixes.
+ std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
+ for (const auto& prefix : non_exist_prefixes) {
+ index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
+ // regular_iter->Seek(prefix);
+
+ ASSERT_OK(index_iter->status());
+ // Seeking to a non-existing prefix should yield either an invalid
+ // iterator, or a key whose prefix is greater than the target.
+ if (index_iter->Valid()) {
+ Slice ukey = ExtractUserKey(index_iter->key());
+ Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+ ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
+ }
+ }
+ for (const auto& prefix : non_exist_prefixes) {
+ index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode());
+ // regular_iter->Seek(prefix);
+
+ ASSERT_OK(index_iter->status());
+ // SeekForPrev to a non-existing prefix should yield either an invalid
+ // iterator, or a key whose prefix is smaller than the target.
+ if (index_iter->Valid()) {
+ Slice ukey = ExtractUserKey(index_iter->key());
+ Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+ ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0);
+ }
+ }
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexTest) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, HashIndexTest) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, PartitionIndexTest) {
+ const int max_index_keys = 5;
+ const int est_max_index_key_value_size = 32;
+ const int est_max_index_size = max_index_keys * est_max_index_key_value_size;
+ for (int i = 1; i <= est_max_index_size + 1; i++) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ table_options.metadata_block_size = i;
+ IndexTest(table_options);
+ }
+}
+
+TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ Options options;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+
+ TableConstructor c(BytewiseComparator());
+ AddInternalKey(&c, "pika");
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+ &kvmap);
+ ASSERT_EQ(1, keys.size());
+
+ auto reader = c.GetTableReader();
+ ReadOptions ropt;
+ ropt.read_tier = ReadTier::kBlockCacheTier;
+ std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+ ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ auto ikey = [](Slice user_key) {
+ return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+ };
+
+ iter->Seek(ikey("pika"));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+
+ // This used to crash at some point.
+ iter->Seek(ikey("pika"));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
+ IndexTest(table_options);
+}
+
+class CustomFlushBlockPolicy : public FlushBlockPolicyFactory,
+ public FlushBlockPolicy {
+ public:
+ explicit CustomFlushBlockPolicy(std::vector<int> keys_per_block)
+ : keys_per_block_(keys_per_block) {}
+
+ const char* Name() const override { return "table_test"; }
+ FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
+ const BlockBuilder&) const override {
+ return new CustomFlushBlockPolicy(keys_per_block_);
+ }
+
+ bool Update(const Slice&, const Slice&) override {
+ if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) {
+ ++current_block_idx_;
+ keys_in_current_block_ = 1;
+ return true;
+ }
+
+ ++keys_in_current_block_;
+ return false;
+ }
+
+ std::vector<int> keys_per_block_;
+
+ int current_block_idx_ = 0;
+ int keys_in_current_block_ = 0;
+};
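+
+// Contract assumed above: the table builder calls Update() with each
+// incoming key/value before adding it, and a true return cuts the current
+// data block first, so the entry starts a new block. A minimal usage
+// sketch (illustrative only):
+//
+//   BlockBasedTableOptions to;
+//   to.flush_block_policy_factory =
+//       std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1});
+//   // yields a two-key block followed by a one-key block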
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) {
+ for (int use_first_key = 0; use_first_key < 2; ++use_first_key) {
+ SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key));
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type =
+ use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey
+ : BlockBasedTableOptions::kBinarySearch;
+ table_options.block_cache = NewLRUCache(10000); // fits all blocks
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1, 3, 2});
+ Options options;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+
+ TableConstructor c(BytewiseComparator());
+
+ // Block 0.
+ AddInternalKey(&c, "aaaa", "v0");
+ AddInternalKey(&c, "aaac", "v1");
+
+ // Block 1.
+ AddInternalKey(&c, "aaca", "v2");
+
+ // Block 2.
+ AddInternalKey(&c, "caaa", "v3");
+ AddInternalKey(&c, "caac", "v4");
+ AddInternalKey(&c, "caae", "v5");
+
+ // Block 3.
+ AddInternalKey(&c, "ccaa", "v6");
+ AddInternalKey(&c, "ccac", "v7");
+
+ // Write the file.
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+ &kvmap);
+ ASSERT_EQ(8, keys.size());
+
+ auto reader = c.GetTableReader();
+ auto props = reader->GetTableProperties();
+ ASSERT_EQ(4u, props->num_data_blocks);
+ std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+ ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ // No data blocks should have been read before the iterator is seeked.
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ auto ikey = [](Slice user_key) {
+ return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+ };
+
+ // Seek to a key between blocks. If index contains first key, we shouldn't
+ // read any data blocks until value is requested.
+ iter->Seek(ikey("aaba"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[2], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 0 : 1,
+ stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ("v2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Seek to the middle of a block. The block should be read right away.
+ iter->Seek(ikey("caab"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[4], iter->key().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ("v4", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Seek to just before the same block and don't access value.
+ // The iterator should keep pinning the block contents.
+ iter->Seek(ikey("baaa"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[3], iter->key().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Seek to the same block again to check that the block is still pinned.
+ iter->Seek(ikey("caae"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[5], iter->key().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ("v5", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Step forward and fall through to the next block. Don't access value.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[6], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 2 : 3,
+ stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Step forward again. Block should be read.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[7], iter->key().ToString());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ("v7", iter->value().ToString());
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Step forward and reach the end.
+ iter->Next();
+ EXPECT_FALSE(iter->Valid());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Seek to a single-key block and step forward without accessing value.
+ iter->Seek(ikey("aaca"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[2], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 0 : 1,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[3], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 1 : 2,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ("v3", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ // Seek between blocks and step back without accessing value.
+ iter->Seek(ikey("aaca"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[2], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 2 : 3,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[1], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 2 : 3,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ // All blocks are in cache now, there'll be no more misses ever.
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ("v1", iter->value().ToString());
+
+ // Next into the next block again.
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[2], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 2 : 4,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Seek to first and step back without accessing value.
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[0], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 2 : 5,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Prev();
+ EXPECT_FALSE(iter->Valid());
+ EXPECT_EQ(use_first_key ? 2 : 5,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Do some SeekForPrev() and SeekToLast() just to cover all methods.
+ iter->SeekForPrev(ikey("caad"));
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[4], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 3 : 6,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ("v4", iter->value().ToString());
+ EXPECT_EQ(use_first_key ? 3 : 6,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(keys[7], iter->key().ToString());
+ EXPECT_EQ(use_first_key ? 4 : 7,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ("v7", iter->value().ToString());
+ EXPECT_EQ(use_first_key ? 4 : 7,
+ stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ c.ResetTableReader();
+ }
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) {
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
+ table_options.block_cache = NewLRUCache(10000);
+ Options options;
+ options.statistics = CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+
+ TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false,
+ /* level */ -1, /* largest_seqno */ 42);
+
+ c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x");
+ c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y");
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+ &kvmap);
+ ASSERT_EQ(2, keys.size());
+
+ auto reader = c.GetTableReader();
+ auto props = reader->GetTableProperties();
+ ASSERT_EQ(1u, props->num_data_blocks);
+ std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+ ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString());
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
+ iter->key().ToString());
+ EXPECT_NE(keys[0], iter->key().ToString());
+ // Key should have been served from index, without reading data blocks.
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ EXPECT_EQ("x", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
+ iter->key().ToString());
+
+ c.ResetTableReader();
+}
+
+// It is very hard to compute the index block size of a table accurately.
+// To make sure we capture the index size, we just verify that, as the
+// number of keys grows, the index block size also grows.
+TEST_P(BlockBasedTableTest, IndexSizeStat) {
+ uint64_t last_index_size = 0;
+
+ // We need to use random keys since purely human-readable text may
+ // compress well, resulting in an insignificant change of the index
+ // block size.
+ Random rnd(test::RandomSeed());
+ std::vector<std::string> keys;
+
+ for (int i = 0; i < 100; ++i) {
+ keys.push_back(RandomString(&rnd, 10000));
+ }
+
+ // Each time we load one more key into the table, the index block size
+ // is expected to be larger than last time's.
+ for (size_t i = 1; i < keys.size(); ++i) {
+ TableConstructor c(BytewiseComparator(),
+ true /* convert_to_internal_key_ */);
+ for (size_t j = 0; j < i; ++j) {
+ c.Add(keys[j], "val");
+ }
+
+ std::vector<std::string> ks;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ options.compression = kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_restart_interval = 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+ auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
+ ASSERT_GT(index_size, last_index_size);
+ last_index_size = index_size;
+ c.ResetTableReader();
+ }
+}
+
+TEST_P(BlockBasedTableTest, NumBlockStat) {
+ Random rnd(test::RandomSeed());
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ Options options;
+ options.compression = kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_restart_interval = 1;
+ table_options.block_size = 1000;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ for (int i = 0; i < 10; ++i) {
+ // Each key/value is slightly smaller than the block size, so that each
+ // block holds roughly one key/value pair.
+ c.Add(RandomString(&rnd, 900), "val");
+ }
+
+ std::vector<std::string> ks;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+ ASSERT_EQ(kvmap.size(),
+ c.GetTableReader()->GetTableProperties()->num_data_blocks);
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingGetTest) {
+ TableConstructor c(BytewiseComparator());
+ Options options;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.create_if_missing = true;
+ table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ SetupTracingTest(&c);
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ std::string user_key = "k01";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ for (uint32_t i = 1; i <= 2; i++) {
+ PinnableSlice value;
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr, nullptr, nullptr,
+ nullptr, nullptr, /*tracing_get_id=*/i);
+ get_perf_context()->Reset();
+ ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context,
+ moptions.prefix_extractor.get()));
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_EQ(value.ToString(), kDummyValue);
+ }
+
+ // Verify traces.
+ std::vector<BlockCacheTraceRecord> expected_records;
+ // The first two records should be prefetching index and filter blocks.
+ BlockCacheTraceRecord record;
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kPrefetch;
+ record.is_cache_hit = Boolean::kFalse;
+ record.no_insert = Boolean::kFalse;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceFilterBlock;
+ expected_records.push_back(record);
+ // Then we should have three records for one index, one filter, and one data
+ // block access.
+ record.get_id = 1;
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kUserGet;
+ record.get_from_user_specified_snapshot = Boolean::kFalse;
+ record.referenced_key = encoded_key;
+ record.referenced_key_exist_in_block = Boolean::kTrue;
+ record.is_cache_hit = Boolean::kTrue;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceFilterBlock;
+ expected_records.push_back(record);
+ record.is_cache_hit = Boolean::kFalse;
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ expected_records.push_back(record);
+ // The second Get should observe cache hits for all blocks.
+ record.is_cache_hit = Boolean::kTrue;
+ record.get_id = 2;
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kUserGet;
+ record.get_from_user_specified_snapshot = Boolean::kFalse;
+ record.referenced_key = encoded_key;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceFilterBlock;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ expected_records.push_back(record);
+ VerifyBlockAccessTrace(&c, expected_records);
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) {
+ TableConstructor c(BytewiseComparator());
+ Options options;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.create_if_missing = true;
+ table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ SetupTracingTest(&c);
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ for (uint32_t i = 1; i <= 2; i++) {
+ std::string user_key = "k01";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ c.GetTableReader()->ApproximateOffsetOf(
+ encoded_key, TableReaderCaller::kUserApproximateSize);
+ }
+ // Verify traces.
+ std::vector<BlockCacheTraceRecord> expected_records;
+ // The first two records should be prefetching index and filter blocks.
+ BlockCacheTraceRecord record;
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kPrefetch;
+ record.is_cache_hit = Boolean::kFalse;
+ record.no_insert = Boolean::kFalse;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceFilterBlock;
+ expected_records.push_back(record);
+ // Then we should have two records, both for index block accesses.
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kUserApproximateSize;
+ record.is_cache_hit = Boolean::kTrue;
+ expected_records.push_back(record);
+ expected_records.push_back(record);
+ VerifyBlockAccessTrace(&c, expected_records);
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingIterator) {
+ TableConstructor c(BytewiseComparator());
+ Options options;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.create_if_missing = true;
+ table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ SetupTracingTest(&c);
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+ for (uint32_t i = 1; i <= 2; i++) {
+ std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
+ ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUserIterator));
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ iter->key();
+ iter->value();
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ iter.reset();
+ }
+
+ // Verify traces.
+ std::vector<BlockCacheTraceRecord> expected_records;
+ // The first two records should be prefetching index and filter blocks.
+ BlockCacheTraceRecord record;
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kPrefetch;
+ record.is_cache_hit = Boolean::kFalse;
+ record.no_insert = Boolean::kFalse;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceFilterBlock;
+ expected_records.push_back(record);
+ // Then we should have three records: one index access and two data
+ // block accesses.
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.caller = TableReaderCaller::kUserIterator;
+ record.is_cache_hit = Boolean::kTrue;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ record.is_cache_hit = Boolean::kFalse;
+ expected_records.push_back(record);
+ expected_records.push_back(record);
+ // When we iterate this file for the second time, we should observe all cache
+ // hits.
+ record.block_type = TraceType::kBlockTraceIndexBlock;
+ record.is_cache_hit = Boolean::kTrue;
+ expected_records.push_back(record);
+ record.block_type = TraceType::kBlockTraceDataBlock;
+ expected_records.push_back(record);
+ expected_records.push_back(record);
+ VerifyBlockAccessTrace(&c, expected_records);
+ c.ResetTableReader();
+}
+
+// A simple tool that takes a snapshot of block cache statistics.
+class BlockCachePropertiesSnapshot {
+ public:
+ explicit BlockCachePropertiesSnapshot(Statistics* statistics) {
+ block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
+ block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
+ index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
+ index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
+ data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
+ data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
+ filter_block_cache_miss =
+ statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
+ filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
+ block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
+ block_cache_bytes_write =
+ statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
+ }
+
+ void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
+ int64_t expected_index_block_cache_hit) {
+ ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+ ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
+ }
+
+ void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss,
+ int64_t expected_filter_block_cache_hit) {
+ ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss);
+ ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit);
+ }
+
+ // Check if the fetched props match the expected ones.
+ // TODO(kailiu) Use this only when the filter policy is disabled!
+ void AssertEqual(int64_t expected_index_block_cache_miss,
+ int64_t expected_index_block_cache_hit,
+ int64_t expected_data_block_cache_miss,
+ int64_t expected_data_block_cache_hit) const {
+ ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+ ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
+ ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss);
+ ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit);
+ ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss,
+ block_cache_miss);
+ ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit,
+ block_cache_hit);
+ }
+
+ int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
+
+ int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
+
+ private:
+ int64_t block_cache_miss = 0;
+ int64_t block_cache_hit = 0;
+ int64_t index_block_cache_miss = 0;
+ int64_t index_block_cache_hit = 0;
+ int64_t data_block_cache_miss = 0;
+ int64_t data_block_cache_hit = 0;
+ int64_t filter_block_cache_miss = 0;
+ int64_t filter_block_cache_hit = 0;
+ int64_t block_cache_bytes_read = 0;
+ int64_t block_cache_bytes_write = 0;
+};
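+
+// Usage sketch (illustrative only): take one snapshot per step and compare
+// cumulative ticker values. Note that AssertEqual() also checks that, with
+// no filter blocks in play, BLOCK_CACHE_MISS/HIT decompose into their
+// index and data components:
+//
+//   BlockCachePropertiesSnapshot props(options.statistics.get());
+//   props.AssertEqual(1 /* index miss */, 0, 0, 0);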
+
+// Make sure that, by default, index/filter blocks are pre-loaded (meaning we
+// won't use the block cache to store them).
+TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
+ Options options;
+ options.create_if_missing = true;
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_cache = NewLRUCache(1024, 4);
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("key", "value");
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+ // preloading filter/index blocks is enabled.
+ auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+ ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+ ASSERT_FALSE(reader->TEST_IndexBlockInCache());
+
+ {
+ // nothing happens in the beginning
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertIndexBlockStat(0, 0);
+ props.AssertFilterBlockStat(0, 0);
+ }
+
+ {
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(), nullptr, nullptr,
+ nullptr, true, nullptr, nullptr);
+ // a hack that just to trigger BlockBasedTable::GetFilter.
+ reader->Get(ReadOptions(), "non-exist-key", &get_context,
+ moptions.prefix_extractor.get());
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertIndexBlockStat(0, 0);
+ props.AssertFilterBlockStat(0, 0);
+ }
+}
+
+// Due to the difficulties of the interaction between statistics, this test
+// only covers the case where the index block is put into the block cache.
+TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
+ // -- Table construction
+ Options options;
+ options.create_if_missing = true;
+ options.statistics = CreateDBStatistics();
+
+ // Enable the cache for index/filter blocks
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ LRUCacheOptions co;
+ co.capacity = 2048;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_options.block_cache = NewLRUCache(co);
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("key", "value");
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ // preloading filter/index blocks is prohibited.
+ auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+ ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+ ASSERT_TRUE(reader->TEST_IndexBlockInCache());
+
+ // -- PART 1: Open with a regular block cache.
+ // With cache_index_and_filter_blocks enabled, opening the table already
+ // inserts the index block into the cache.
+ std::unique_ptr<InternalIterator> iter;
+
+ int64_t last_cache_bytes_read = 0;
+ // At first, no block will be accessed.
+ {
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ // index will be added to block cache.
+ props.AssertEqual(1, // index block miss
+ 0, 0, 0);
+ ASSERT_EQ(props.GetCacheBytesRead(), 0);
+ ASSERT_EQ(props.GetCacheBytesWrite(),
+ static_cast<int64_t>(table_options.block_cache->GetUsage()));
+ last_cache_bytes_read = props.GetCacheBytesRead();
+ }
+
+ // Only index block will be accessed
+ {
+ iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ // NOTE: to better highlight the "delta" of each ticker, I use
+ // <last_value> + <added_value> to indicate the increment of a changed
+ // value; other numbers remain the same.
+ props.AssertEqual(1, 0 + 1, // index block hit
+ 0, 0);
+ // Cache hit, bytes read from cache should increase
+ ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+ ASSERT_EQ(props.GetCacheBytesWrite(),
+ static_cast<int64_t>(table_options.block_cache->GetUsage()));
+ last_cache_bytes_read = props.GetCacheBytesRead();
+ }
+
+ // Only data block will be accessed
+ {
+ iter->SeekToFirst();
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertEqual(1, 1, 0 + 1, // data block miss
+ 0);
+ // Cache miss, Bytes read from cache should not change
+ ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
+ ASSERT_EQ(props.GetCacheBytesWrite(),
+ static_cast<int64_t>(table_options.block_cache->GetUsage()));
+ last_cache_bytes_read = props.GetCacheBytesRead();
+ }
+
+ // Data block will be in cache
+ {
+ iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
+ iter->SeekToFirst();
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertEqual(1, 1 + 1, /* index block hit */
+ 1, 0 + 1 /* data block hit */);
+ // Cache hit, bytes read from cache should increase
+ ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+ ASSERT_EQ(props.GetCacheBytesWrite(),
+ static_cast<int64_t>(table_options.block_cache->GetUsage()));
+ }
+ // release the iterator so that the block cache can reset correctly.
+ iter.reset();
+
+ c.ResetTableReader();
+
+ // -- PART 2: Open with very small block cache
+ // In this test, no block will ever get hit since the block cache is
+ // too small to fit even one entry.
+ table_options.block_cache = NewLRUCache(1, 4);
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ const ImmutableCFOptions ioptions2(options);
+ const MutableCFOptions moptions2(options);
+ c.Reopen(ioptions2, moptions2);
+ {
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertEqual(1, // index block miss
+ 0, 0, 0);
+ // Cache miss, Bytes read from cache should not change
+ ASSERT_EQ(props.GetCacheBytesRead(), 0);
+ }
+
+ {
+ // Both index and data block get accessed.
+ // It first caches the index block and then the data block. But since
+ // the cache size is only 1, the index block will be purged after the
+ // data block is inserted.
+ iter.reset(c.NewIterator(moptions2.prefix_extractor.get()));
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertEqual(1 + 1, // index block miss
+ 0, 0, // data block miss
+ 0);
+ // Cache misses only; bytes read from cache should remain 0
+ ASSERT_EQ(props.GetCacheBytesRead(), 0);
+ }
+
+ {
+ // SeekToFirst() accesses the data block. For the same reason, we
+ // expect a data block cache miss.
+ iter->SeekToFirst();
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertEqual(2, 0, 0 + 1, // data block miss
+ 0);
+ // Cache miss, Bytes read from cache should not change
+ ASSERT_EQ(props.GetCacheBytesRead(), 0);
+ }
+ iter.reset();
+ c.ResetTableReader();
+
+ // -- PART 3: Open table with bloom filter enabled but not in SST file
+ table_options.block_cache = NewLRUCache(4096, 4);
+ table_options.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TableConstructor c3(BytewiseComparator());
+ std::string user_key = "k01";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ c3.Add(internal_key.Encode().ToString(), "hello");
+ ImmutableCFOptions ioptions3(options);
+ MutableCFOptions moptions3(options);
+ // Generate table without filter policy
+ c3.Finish(options, ioptions3, moptions3, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ c3.ResetTableReader();
+
+ // Open table with filter policy
+ table_options.filter_policy.reset(NewBloomFilterPolicy(1));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+ ImmutableCFOptions ioptions4(options);
+ MutableCFOptions moptions4(options);
+ ASSERT_OK(c3.Reopen(ioptions4, moptions4));
+ reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader());
+ ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+ PinnableSlice value;
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
+ moptions4.prefix_extractor.get()));
+ ASSERT_STREQ(value.data(), "hello");
+ BlockCachePropertiesSnapshot props(options.statistics.get());
+ props.AssertFilterBlockStat(0, 0);
+ c3.ResetTableReader();
+}
+
+void ValidateBlockSizeDeviation(int value, int expected) {
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = value;
+ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
+
+ const BlockBasedTableOptions* normalized_table_options =
+ static_cast<const BlockBasedTableOptions*>(factory->GetOptions());
+ ASSERT_EQ(normalized_table_options->block_size_deviation, expected);
+
+ delete factory;
+}
+
+void ValidateBlockRestartInterval(int value, int expected) {
+ BlockBasedTableOptions table_options;
+ table_options.block_restart_interval = value;
+ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
+
+ const BlockBasedTableOptions* normalized_table_options =
+ static_cast<const BlockBasedTableOptions*>(factory->GetOptions());
+ ASSERT_EQ(normalized_table_options->block_restart_interval, expected);
+
+ delete factory;
+}
+
+TEST_P(BlockBasedTableTest, InvalidOptions) {
+ // invalid values for block_size_deviation (<0 or >100) are silently set to 0
+ ValidateBlockSizeDeviation(-10, 0);
+ ValidateBlockSizeDeviation(-1, 0);
+ ValidateBlockSizeDeviation(0, 0);
+ ValidateBlockSizeDeviation(1, 1);
+ ValidateBlockSizeDeviation(99, 99);
+ ValidateBlockSizeDeviation(100, 100);
+ ValidateBlockSizeDeviation(101, 0);
+ ValidateBlockSizeDeviation(1000, 0);
+
+ // invalid values for block_restart_interval (<1) are silently set to 1
+ ValidateBlockRestartInterval(-10, 1);
+ ValidateBlockRestartInterval(-1, 1);
+ ValidateBlockRestartInterval(0, 1);
+ ValidateBlockRestartInterval(1, 1);
+ ValidateBlockRestartInterval(2, 2);
+ ValidateBlockRestartInterval(1000, 1000);
+}
+
+TEST_P(BlockBasedTableTest, BlockReadCountTest) {
+ // bloom_filter_type = 0 -- block-based filter
+ // bloom_filter_type = 1 -- full filter
+ for (int bloom_filter_type = 0; bloom_filter_type < 2; ++bloom_filter_type) {
+ for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
+ ++index_and_filter_in_cache) {
+ Options options;
+ options.create_if_missing = true;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_cache = NewLRUCache(1, 0);
+ table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(10, bloom_filter_type == 0));
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+
+ TableConstructor c(BytewiseComparator());
+ std::string user_key = "k04";
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ c.Add(encoded_key, "hello");
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ // Generate table with filter policy
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+ auto reader = c.GetTableReader();
+ PinnableSlice value;
+ {
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ get_perf_context()->Reset();
+ ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+ moptions.prefix_extractor.get()));
+ if (index_and_filter_in_cache) {
+ // data, index and filter block
+ ASSERT_EQ(get_perf_context()->block_read_count, 3);
+ ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
+ ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+ } else {
+ // just the data block
+ ASSERT_EQ(get_perf_context()->block_read_count, 1);
+ }
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_STREQ(value.data(), "hello");
+ }
+
+ // Get non-existing key
+ user_key = "does-not-exist";
+ internal_key = InternalKey(user_key, 0, kTypeValue);
+ encoded_key = internal_key.Encode().ToString();
+
+ value.Reset();
+ {
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ get_perf_context()->Reset();
+ ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+ moptions.prefix_extractor.get()));
+ ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+ }
+
+ if (index_and_filter_in_cache) {
+ if (bloom_filter_type == 0) {
+ // with block-based, we read index and then the filter
+ ASSERT_EQ(get_perf_context()->block_read_count, 2);
+ ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
+ ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+ } else {
+ // with full-filter, we read filter first and then we stop
+ ASSERT_EQ(get_perf_context()->block_read_count, 1);
+ ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+ }
+ } else {
+ // filter is already in memory and it figures out that the key doesn't
+ // exist
+ ASSERT_EQ(get_perf_context()->block_read_count, 0);
+ }
+ }
+ }
+}
+
+TEST_P(BlockBasedTableTest, BlockCacheLeak) {
+ // Check that when we reopen a table we don't lose access to blocks already
+ // in the cache. This test checks whether the Table actually makes use of the
+ // unique ID from the file.
+
+ Options opt;
+ std::unique_ptr<InternalKeyComparator> ikc;
+ ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+ opt.compression = kNoCompression;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.block_size = 1024;
+ // big enough so we don't ever lose cached values.
+ table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("k01", "hello");
+ c.Add("k02", "hello2");
+ c.Add("k03", std::string(10000, 'x'));
+ c.Add("k04", std::string(200000, 'x'));
+ c.Add("k05", std::string(300000, 'x'));
+ c.Add("k06", "hello3");
+ c.Add("k07", std::string(100000, 'x'));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(opt);
+ const MutableCFOptions moptions(opt);
+ c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+
+ std::unique_ptr<InternalIterator> iter(
+ c.NewIterator(moptions.prefix_extractor.get()));
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ iter->key();
+ iter->value();
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ iter.reset();
+
+ const ImmutableCFOptions ioptions1(opt);
+ const MutableCFOptions moptions1(opt);
+ ASSERT_OK(c.Reopen(ioptions1, moptions1));
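+  // Editor's note: after reopen the reader object is new, but block-cache
+  // keys are derived from the file's unique ID, so every block cached before
+  // the reopen should still be found below.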
+ auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+ for (const std::string& key : keys) {
+ InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+ ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+ }
+ c.ResetTableReader();
+
+ // rerun with different block cache
+ table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ const ImmutableCFOptions ioptions2(opt);
+ const MutableCFOptions moptions2(opt);
+ ASSERT_OK(c.Reopen(ioptions2, moptions2));
+ table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+ for (const std::string& key : keys) {
+ InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+ ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+ }
+ c.ResetTableReader();
+}
+
+namespace {
+class CustomMemoryAllocator : public MemoryAllocator {
+ public:
+ const char* Name() const override { return "CustomMemoryAllocator"; }
+
+ void* Allocate(size_t size) override {
+ ++numAllocations;
+ auto ptr = new char[size + 16];
+    memcpy(ptr, "memory_allocator_", 16);  // stamp a marker into the hidden
+                                           // 16-byte header
+ return reinterpret_cast<void*>(ptr + 16);
+ }
+ void Deallocate(void* p) override {
+ ++numDeallocations;
+ char* ptr = reinterpret_cast<char*>(p) - 16;
+ delete[] ptr;
+ }
+
+  // Explicitly zero-initialize so the counters don't depend on
+  // value-initialization by the caller (e.g. std::make_shared).
+  std::atomic<int> numAllocations{0};
+  std::atomic<int> numDeallocations{0};
+};
+} // namespace
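+
+// A minimal round-trip sketch of the shim above (editor's illustration, not
+// part of the test): Allocate() returns a pointer 16 bytes into the real
+// new[]'d buffer, so Deallocate() must rewind by the same 16 bytes.
+//
+//   CustomMemoryAllocator a;
+//   void* p = a.Allocate(32);  // underlying buffer is 32 + 16 bytes
+//   a.Deallocate(p);           // steps back 16 bytes before delete[]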
+
+TEST_P(BlockBasedTableTest, MemoryAllocator) {
+ auto custom_memory_allocator = std::make_shared<CustomMemoryAllocator>();
+ {
+ Options opt;
+ std::unique_ptr<InternalKeyComparator> ikc;
+ ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+ opt.compression = kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ LRUCacheOptions lruOptions;
+ lruOptions.memory_allocator = custom_memory_allocator;
+ lruOptions.capacity = 16 * 1024 * 1024;
+ lruOptions.num_shard_bits = 4;
+ table_options.block_cache = NewLRUCache(std::move(lruOptions));
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TableConstructor c(BytewiseComparator(),
+ true /* convert_to_internal_key_ */);
+ c.Add("k01", "hello");
+ c.Add("k02", "hello2");
+ c.Add("k03", std::string(10000, 'x'));
+ c.Add("k04", std::string(200000, 'x'));
+ c.Add("k05", std::string(300000, 'x'));
+ c.Add("k06", "hello3");
+ c.Add("k07", std::string(100000, 'x'));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(opt);
+ const MutableCFOptions moptions(opt);
+ c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+
+ std::unique_ptr<InternalIterator> iter(
+ c.NewIterator(moptions.prefix_extractor.get()));
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ iter->key();
+ iter->value();
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ }
+
+  // Now that the block cache is out of scope it should have been destroyed,
+  // and every allocation should have been deallocated.
+ EXPECT_EQ(custom_memory_allocator->numAllocations.load(),
+ custom_memory_allocator->numDeallocations.load());
+ // make sure that allocations actually happened through the cache allocator
+ EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0);
+}
+
+// Test the file checksum of block based table
+TEST_P(BlockBasedTableTest, NoFileChecksum) {
+ Options options;
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ SequenceNumber largest_seqno = 0;
+ int level = 0;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+
+ if (largest_seqno != 0) {
+ // Pretend that it's an external file written by SstFileWriter.
+ int_tbl_prop_collector_factories.emplace_back(
+ new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+ 0 /* global_seqno*/));
+ }
+ std::string column_family_name;
+
+ FileChecksumTestHelper f(true);
+ f.CreateWriteableFile();
+ std::unique_ptr<TableBuilder> builder;
+ builder.reset(ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, *comparator,
+ &int_tbl_prop_collector_factories,
+ options.compression, options.sample_for_compression,
+ options.compression_opts, false /* skip_filters */,
+ column_family_name, level),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ f.GetFileWriter()));
+ f.ResetTableBuilder(std::move(builder));
+ f.AddKVtoKVMap(1000);
+ f.WriteKVAndFlushTable();
+ ASSERT_STREQ(f.GetFileChecksumFuncName(),
+ kUnknownFileChecksumFuncName.c_str());
+ ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str());
+}
+
+TEST_P(BlockBasedTableTest, Crc32FileChecksum) {
+ Options options;
+ options.sst_file_checksum_func =
+ std::shared_ptr<FileChecksumFunc>(CreateFileChecksumFuncCrc32c());
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ SequenceNumber largest_seqno = 0;
+ int level = 0;
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+
+ if (largest_seqno != 0) {
+ // Pretend that it's an external file written by SstFileWriter.
+ int_tbl_prop_collector_factories.emplace_back(
+ new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+ 0 /* global_seqno*/));
+ }
+ std::string column_family_name;
+
+ FileChecksumTestHelper f(true);
+ f.CreateWriteableFile();
+ f.SetFileChecksumFunc(options.sst_file_checksum_func.get());
+ std::unique_ptr<TableBuilder> builder;
+ builder.reset(ioptions.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, *comparator,
+ &int_tbl_prop_collector_factories,
+ options.compression, options.sample_for_compression,
+ options.compression_opts, false /* skip_filters */,
+ column_family_name, level),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ f.GetFileWriter()));
+ f.ResetTableBuilder(std::move(builder));
+ f.AddKVtoKVMap(1000);
+ f.WriteKVAndFlushTable();
+ ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
+ std::string checksum;
+ ASSERT_OK(
+ f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum));
+ ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
+}
+
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(PlainTableTest, BasicPlainTableProperties) {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ PlainTableFactory factory(plain_table_options);
+  std::unique_ptr<WritableFileWriter> file_writer(
+      test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */));
+ Options options;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+ int unknown_level = -1;
+ std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+ TableBuilderOptions(
+ ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+ kNoCompression, 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, unknown_level),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer.get()));
+
+ for (char c = 'a'; c <= 'z'; ++c) {
+ std::string key(8, c);
+ key.append("\1 "); // PlainTable expects internal key structure
+ std::string value(28, c + 42);
+ builder->Add(key, value);
+ }
+ ASSERT_OK(builder->Finish());
+ file_writer->Flush();
+
+ test::StringSink* ss =
+ ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(file_writer.get());
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(ss->contents(), 72242, true)));
+
+ TableProperties* props = nullptr;
+ auto s = ReadTableProperties(file_reader.get(), ss->contents().size(),
+ kPlainTableMagicNumber, ioptions,
+ &props, true /* compression_type_missing */);
+ std::unique_ptr<TableProperties> props_guard(props);
+ ASSERT_OK(s);
+
+ ASSERT_EQ(0ul, props->index_size);
+ ASSERT_EQ(0ul, props->filter_size);
+ ASSERT_EQ(16ul * 26, props->raw_key_size);
+ ASSERT_EQ(28ul * 26, props->raw_value_size);
+ ASSERT_EQ(26ul, props->num_entries);
+ ASSERT_EQ(1ul, props->num_data_blocks);
+}
+
+TEST_F(PlainTableTest, NoFileChecksum) {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 20;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+ PlainTableFactory factory(plain_table_options);
+
+ Options options;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+ int unknown_level = -1;
+ FileChecksumTestHelper f(true);
+ f.CreateWriteableFile();
+
+ std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+ TableBuilderOptions(
+ ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+ kNoCompression, 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, unknown_level),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ f.GetFileWriter()));
+ f.ResetTableBuilder(std::move(builder));
+ f.AddKVtoKVMap(1000);
+ f.WriteKVAndFlushTable();
+ ASSERT_STREQ(f.GetFileChecksumFuncName(),
+ kUnknownFileChecksumFuncName.c_str());
+  EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum);
+}
+
+TEST_F(PlainTableTest, Crc32FileChecksum) {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 20;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+ PlainTableFactory factory(plain_table_options);
+
+ Options options;
+ options.sst_file_checksum_func =
+ std::shared_ptr<FileChecksumFunc>(CreateFileChecksumFuncCrc32c());
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+ int unknown_level = -1;
+ FileChecksumTestHelper f(true);
+ f.CreateWriteableFile();
+ f.SetFileChecksumFunc(options.sst_file_checksum_func.get());
+
+ std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+ TableBuilderOptions(
+ ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+ kNoCompression, 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, unknown_level),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ f.GetFileWriter()));
+ f.ResetTableBuilder(std::move(builder));
+ f.AddKVtoKVMap(1000);
+ f.WriteKVAndFlushTable();
+ ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
+ std::string checksum;
+ ASSERT_OK(
+ f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum));
+ EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
+}
+
+#endif // !ROCKSDB_LITE
+
+TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("k01", "hello");
+ c.Add("k02", "hello2");
+ c.Add("k03", std::string(10000, 'x'));
+ c.Add("k04", std::string(200000, 'x'));
+ c.Add("k05", std::string(300000, 'x'));
+ c.Add("k06", "hello3");
+ c.Add("k07", std::string(100000, 'x'));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ test::PlainInternalKeyComparator internal_comparator(options.comparator);
+ options.compression = kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+ &keys, &kvmap);
+
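+  // Editor's note: approximate offsets track cumulative value sizes, since
+  // compression is off: ~10000 bytes after k03's value, ~210000 after k04's,
+  // ~510000 after k05's, and the file ends near 610000 once k07's
+  // 100000-byte value is added.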
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
+  // k04 and k05 land in two consecutive blocks; the index key is an
+  // arbitrary separator between k04 and k05, which may fall before or after
+  // k04a.
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
+ c.ResetTableReader();
+}
+
+static void DoCompressionTest(CompressionType comp) {
+ Random rnd(301);
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ std::string tmp;
+ c.Add("k01", "hello");
+ c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+ c.Add("k03", "hello3");
+ c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ test::PlainInternalKeyComparator ikc(options.comparator);
+ options.compression = comp;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
+
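+  // Rough expectation (editor's note): each 10000-byte value was generated
+  // with a 0.25 compressible fraction, so it should shrink to roughly 2500
+  // bytes; hence k03/k04 sit near offset 2500 and the file ends near 5000.
+  // The wide windows below absorb block headers and per-compressor variance.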
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500));
+ c.ResetTableReader();
+}
+
+TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) {
+ std::vector<CompressionType> compression_state;
+ if (!Snappy_Supported()) {
+ fprintf(stderr, "skipping snappy compression tests\n");
+ } else {
+ compression_state.push_back(kSnappyCompression);
+ }
+
+ if (!Zlib_Supported()) {
+ fprintf(stderr, "skipping zlib compression tests\n");
+ } else {
+ compression_state.push_back(kZlibCompression);
+ }
+
+ // TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
+ /*
+ if (!BZip2_Supported()) {
+ fprintf(stderr, "skipping bzip2 compression tests\n");
+ } else {
+ compression_state.push_back(kBZip2Compression);
+ }
+ */
+
+ if (!LZ4_Supported()) {
+ fprintf(stderr, "skipping lz4 and lz4hc compression tests\n");
+ } else {
+ compression_state.push_back(kLZ4Compression);
+ compression_state.push_back(kLZ4HCCompression);
+ }
+
+  if (!XPRESS_Supported()) {
+    fprintf(stderr, "skipping xpress compression tests\n");
+  } else {
+    compression_state.push_back(kXpressCompression);
+  }
+
+ for (auto state : compression_state) {
+ DoCompressionTest(state);
+ }
+}
+
+#ifndef ROCKSDB_VALGRIND_RUN
+// RandomizedHarnessTest is very slow for certain combinations of arguments.
+// Split it into 8 pieces to reduce the time individual tests take.
+TEST_F(HarnessTest, Randomized1) {
+ // part 1 out of 8
+ const size_t part = 1;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized2) {
+ // part 2 out of 8
+ const size_t part = 2;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized3) {
+ // part 3 out of 8
+ const size_t part = 3;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized4) {
+ // part 4 out of 8
+ const size_t part = 4;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized5) {
+ // part 5 out of 8
+ const size_t part = 5;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized6) {
+ // part 6 out of 8
+ const size_t part = 6;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized7) {
+ // part 7 out of 8
+ const size_t part = 7;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+TEST_F(HarnessTest, Randomized8) {
+ // part 8 out of 8
+ const size_t part = 8;
+ const size_t total = 8;
+ RandomizedHarnessTest(part, total);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(HarnessTest, RandomizedLongDB) {
+ Random rnd(test::RandomSeed());
+ TestArgs args = {DB_TEST, false, 16, kNoCompression, 0, false};
+ Init(args);
+ int num_entries = 100000;
+ for (int e = 0; e < num_entries; e++) {
+ std::string v;
+ Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+ test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+ }
+ Test(&rnd);
+
+ // We must have created enough data to force merging
+ int files = 0;
+ for (int level = 0; level < db()->NumberLevels(); level++) {
+ std::string value;
+ char name[100];
+ snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
+ ASSERT_TRUE(db()->GetProperty(name, &value));
+ files += atoi(value.c_str());
+ }
+ ASSERT_GT(files, 0);
+}
+#endif // ROCKSDB_LITE
+#endif // ROCKSDB_VALGRIND_RUN
+
+class MemTableTest : public testing::Test {};
+
+TEST_F(MemTableTest, Simple) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto table_factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = table_factory;
+ ImmutableCFOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* memtable =
+ new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ memtable->Ref();
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ batch.Put(std::string("k1"), std::string("v1"));
+ batch.Put(std::string("k2"), std::string("v2"));
+ batch.Put(std::string("k3"), std::string("v3"));
+ batch.Put(std::string("largekey"), std::string("vlarge"));
+ batch.DeleteRange(std::string("chi"), std::string("xigua"));
+ batch.DeleteRange(std::string("begin"), std::string("end"));
+ ColumnFamilyMemTablesDefault cf_mems_default(memtable);
+ ASSERT_TRUE(
+ WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr)
+ .ok());
+
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = memtable->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = memtable->NewRangeTombstoneIterator(
+ ReadOptions(), kMaxSequenceNumber /* read_seq */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(),
+ iter->value().ToString().c_str());
+ iter->Next();
+ }
+ }
+
+ delete memtable->Unref();
+}
+
+// Test the empty key
+TEST_F(HarnessTest, SimpleEmptyKey) {
+ auto args = GenerateArgList();
+ for (const auto& arg : args) {
+ Init(arg);
+ Random rnd(test::RandomSeed() + 1);
+ Add("", "v");
+ Test(&rnd);
+ }
+}
+
+TEST_F(HarnessTest, SimpleSingle) {
+ auto args = GenerateArgList();
+ for (const auto& arg : args) {
+ Init(arg);
+ Random rnd(test::RandomSeed() + 2);
+ Add("abc", "v");
+ Test(&rnd);
+ }
+}
+
+TEST_F(HarnessTest, SimpleMulti) {
+ auto args = GenerateArgList();
+ for (const auto& arg : args) {
+ Init(arg);
+ Random rnd(test::RandomSeed() + 3);
+ Add("abc", "v");
+ Add("abcd", "v");
+ Add("ac", "v2");
+ Test(&rnd);
+ }
+}
+
+TEST_F(HarnessTest, SimpleSpecialKey) {
+ auto args = GenerateArgList();
+ for (const auto& arg : args) {
+ Init(arg);
+ Random rnd(test::RandomSeed() + 4);
+ Add("\xff\xff", "v3");
+ Test(&rnd);
+ }
+}
+
+TEST_F(HarnessTest, FooterTests) {
+ {
+ // upconvert legacy block based
+ std::string encoded;
+ Footer footer(kLegacyBlockBasedTableMagicNumber, 0);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 0U);
+ }
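+  // Editor's note: a legacy footer carries no version or checksum-type
+  // field; DecodeFrom() upconverts it to version 0 with kCRC32c, which is
+  // what the assertions above rely on.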
+ {
+ // xxhash block based
+ std::string encoded;
+ Footer footer(kBlockBasedTableMagicNumber, 1);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.set_checksum(kxxHash);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 1U);
+ }
+ {
+ // xxhash64 block based
+ std::string encoded;
+ Footer footer(kBlockBasedTableMagicNumber, 1);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.set_checksum(kxxHash64);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kxxHash64);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 1U);
+ }
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ {
+ // upconvert legacy plain table
+ std::string encoded;
+ Footer footer(kLegacyPlainTableMagicNumber, 0);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 0U);
+ }
+ {
+ // xxhash block based
+ std::string encoded;
+ Footer footer(kPlainTableMagicNumber, 1);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.set_checksum(kxxHash);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kxxHash);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 1U);
+ }
+#endif // !ROCKSDB_LITE
+ {
+ // version == 2
+ std::string encoded;
+ Footer footer(kBlockBasedTableMagicNumber, 2);
+ BlockHandle meta_index(10, 5), index(20, 15);
+ footer.set_metaindex_handle(meta_index);
+ footer.set_index_handle(index);
+ footer.EncodeTo(&encoded);
+ Footer decoded_footer;
+ Slice encoded_slice(encoded);
+ decoded_footer.DecodeFrom(&encoded_slice);
+ ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+ ASSERT_EQ(decoded_footer.checksum(), kCRC32c);
+ ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+ ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+ ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+ ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+ ASSERT_EQ(decoded_footer.version(), 2U);
+ }
+}
+
+class IndexBlockRestartIntervalTest
+ : public TableTest,
+ public ::testing::WithParamInterface<std::pair<int, bool>> {
+ public:
+ static std::vector<std::pair<int, bool>> GetRestartValues() {
+ return {{-1, false}, {0, false}, {1, false}, {8, false},
+ {16, false}, {32, false}, {-1, true}, {0, true},
+ {1, true}, {8, true}, {16, true}, {32, true}};
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest,
+ ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues()));
+
+TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) {
+ const int kKeysInTable = 10000;
+ const int kKeySize = 100;
+ const int kValSize = 500;
+
+ const int index_block_restart_interval = std::get<0>(GetParam());
+ const bool value_delta_encoding = std::get<1>(GetParam());
+
+ Options options;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 64; // small block size to get big index block
+ table_options.index_block_restart_interval = index_block_restart_interval;
+ if (value_delta_encoding) {
+ table_options.format_version = 4;
+ }
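+  // Editor's note: format_version >= 4 delta-encodes index values, which is
+  // the value_delta_encoding dimension of this parameterized test.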
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ TableConstructor c(BytewiseComparator());
+ static Random rnd(301);
+ for (int i = 0; i < kKeysInTable; i++) {
+ InternalKey k(RandomString(&rnd, kKeySize), 0, kTypeValue);
+ c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize));
+ }
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ std::unique_ptr<InternalKeyComparator> comparator(
+ new InternalKeyComparator(BytewiseComparator()));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+ &kvmap);
+ auto reader = c.GetTableReader();
+
+ std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
+ ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ // Test point lookup
+ for (auto& kv : kvmap) {
+ db_iter->Seek(kv.first);
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), kv.first);
+ ASSERT_EQ(db_iter->value(), kv.second);
+ }
+
+ // Test iterating
+ auto kv_iter = kvmap.begin();
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ ASSERT_EQ(db_iter->key(), kv_iter->first);
+ ASSERT_EQ(db_iter->value(), kv_iter->second);
+ kv_iter++;
+ }
+ ASSERT_EQ(kv_iter, kvmap.end());
+ c.ResetTableReader();
+}
+
+class PrefixTest : public testing::Test {
+ public:
+ PrefixTest() : testing::Test() {}
+ ~PrefixTest() override {}
+};
+
+namespace {
+// A simple PrefixExtractor used only by PrefixAndWholeKeyTest: keys are
+// 4 bytes of the form "[d]d" (e.g. "[5]7"), and the 3-byte "[d]" part is
+// the prefix.
+class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
+ public:
+  ~TestPrefixExtractor() override {}
+ const char* Name() const override { return "TestPrefixExtractor"; }
+
+ ROCKSDB_NAMESPACE::Slice Transform(
+ const ROCKSDB_NAMESPACE::Slice& src) const override {
+ assert(IsValid(src));
+ return ROCKSDB_NAMESPACE::Slice(src.data(), 3);
+ }
+
+ bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override {
+ assert(IsValid(src));
+ return true;
+ }
+
+ bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
+ return true;
+ }
+
+ bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
+ if (src.size() != 4) {
+ return false;
+ }
+ if (src[0] != '[') {
+ return false;
+ }
+ if (src[1] < '0' || src[1] > '9') {
+ return false;
+ }
+ if (src[2] != ']') {
+ return false;
+ }
+ if (src[3] < '0' || src[3] > '9') {
+ return false;
+ }
+ return true;
+ }
+};
+} // namespace
+
+TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
+ ROCKSDB_NAMESPACE::Options options;
+ options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
+ options.num_levels = 20;
+ options.create_if_missing = true;
+ options.optimize_filters_for_hits = false;
+ options.target_file_size_base = 268435456;
+ options.prefix_extractor = std::make_shared<TestPrefixExtractor>();
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.block_size = 262144;
+ bbto.whole_key_filtering = true;
+
+ const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyDB(kDBPath, options);
+ ROCKSDB_NAMESPACE::DB* db;
+ ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+
+  // Create 100 keys spanning 10 distinct prefixes.
+ for (int i = 0; i < 10; i++) {
+ std::string prefix = "[" + std::to_string(i) + "]";
+ for (int j = 0; j < 10; j++) {
+ std::string key = prefix + std::to_string(j);
+ db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1");
+ }
+ }
+
+ // Trigger compaction.
+ db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ delete db;
+  // A follow-up round could turn whole_key_filtering off and expect RocksDB
+  // to keep working against the same files.
+}
+
+/*
+ * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in
+ * the SST file any more. Instead, RocksDB deduces global_seqno from the
+ * MANIFEST while reading from an SST. Therefore, it's not possible to test the
+ * functionality of global_seqno in a single, isolated unit test without the
+ * involvement of Version, VersionSet, etc.
+ */
+TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
+ BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+ test::StringSink* sink = new test::StringSink();
+ std::unique_ptr<WritableFileWriter> file_writer(
+ test::GetWritableFileWriter(sink, "" /* don't care */));
+ Options options;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ int_tbl_prop_collector_factories.emplace_back(
+ new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+ 0 /* global_seqno*/));
+ std::string column_family_name;
+ std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, ikc,
+ &int_tbl_prop_collector_factories, kNoCompression,
+ 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, -1),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer.get()));
+
+ for (char c = 'a'; c <= 'z'; ++c) {
+ std::string key(8, c);
+ std::string value = key;
+ InternalKey ik(key, 0, kTypeValue);
+
+ builder->Add(ik.Encode(), value);
+ }
+ ASSERT_OK(builder->Finish());
+ file_writer->Flush();
+
+ test::RandomRWStringSink ss_rw(sink);
+ uint32_t version;
+ uint64_t global_seqno;
+ uint64_t global_seqno_offset;
+
+ // Helper function to get version, global_seqno, global_seqno_offset
+ std::function<void()> GetVersionAndGlobalSeqno = [&]() {
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(ss_rw.contents(), 73342, true)));
+
+ TableProperties* props = nullptr;
+ ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(),
+ kBlockBasedTableMagicNumber, ioptions,
+ &props, true /* compression_type_missing */));
+
+ UserCollectedProperties user_props = props->user_collected_properties;
+ version = DecodeFixed32(
+ user_props[ExternalSstFilePropertyNames::kVersion].c_str());
+ global_seqno = DecodeFixed64(
+ user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str());
+ global_seqno_offset =
+ props->properties_offsets[ExternalSstFilePropertyNames::kGlobalSeqno];
+
+ delete props;
+ };
+
+ // Helper function to update the value of the global seqno in the file
+ std::function<void(uint64_t)> SetGlobalSeqno = [&](uint64_t val) {
+ std::string new_global_seqno;
+ PutFixed64(&new_global_seqno, val);
+
+ ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno));
+ };
+
+ // Helper function to get the contents of the table InternalIterator
+ std::unique_ptr<TableReader> table_reader;
+ std::function<InternalIterator*()> GetTableInternalIter = [&]() {
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(ss_rw.contents(), 73342, true)));
+
+ options.table_factory->NewTableReader(
+ TableReaderOptions(ioptions, moptions.prefix_extractor.get(),
+ EnvOptions(), ikc),
+ std::move(file_reader), ss_rw.contents().size(), &table_reader);
+
+ return table_reader->NewIterator(
+ ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+ };
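+
+  // Editor's note: each call above rebuilds the table reader from the
+  // current (possibly rewritten) file contents, so iterators observe
+  // whatever global seqno was last written into the properties block.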
+
+ GetVersionAndGlobalSeqno();
+ ASSERT_EQ(2u, version);
+ ASSERT_EQ(0u, global_seqno);
+
+ InternalIterator* iter = GetTableInternalIter();
+ char current_c = 'a';
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey pik;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &pik));
+
+ ASSERT_EQ(pik.type, ValueType::kTypeValue);
+ ASSERT_EQ(pik.sequence, 0);
+ ASSERT_EQ(pik.user_key, iter->value());
+ ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+ current_c++;
+ }
+ ASSERT_EQ(current_c, 'z' + 1);
+ delete iter;
+
+ // Update global sequence number to 10
+ SetGlobalSeqno(10);
+ GetVersionAndGlobalSeqno();
+ ASSERT_EQ(2u, version);
+ ASSERT_EQ(10u, global_seqno);
+
+ iter = GetTableInternalIter();
+ current_c = 'a';
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey pik;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &pik));
+
+ ASSERT_EQ(pik.type, ValueType::kTypeValue);
+ ASSERT_EQ(pik.sequence, 10);
+ ASSERT_EQ(pik.user_key, iter->value());
+ ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+ current_c++;
+ }
+ ASSERT_EQ(current_c, 'z' + 1);
+
+ // Verify Seek
+ for (char c = 'a'; c <= 'z'; c++) {
+ std::string k = std::string(8, c);
+ InternalKey ik(k, 10, kValueTypeForSeek);
+ iter->Seek(ik.Encode());
+ ASSERT_TRUE(iter->Valid());
+
+ ParsedInternalKey pik;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &pik));
+
+ ASSERT_EQ(pik.type, ValueType::kTypeValue);
+ ASSERT_EQ(pik.sequence, 10);
+ ASSERT_EQ(pik.user_key.ToString(), k);
+ ASSERT_EQ(iter->value().ToString(), k);
+ }
+ delete iter;
+
+ // Update global sequence number to 3
+ SetGlobalSeqno(3);
+ GetVersionAndGlobalSeqno();
+ ASSERT_EQ(2u, version);
+ ASSERT_EQ(3u, global_seqno);
+
+ iter = GetTableInternalIter();
+ current_c = 'a';
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey pik;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &pik));
+
+ ASSERT_EQ(pik.type, ValueType::kTypeValue);
+ ASSERT_EQ(pik.sequence, 3);
+ ASSERT_EQ(pik.user_key, iter->value());
+ ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+ current_c++;
+ }
+ ASSERT_EQ(current_c, 'z' + 1);
+
+ // Verify Seek
+ for (char c = 'a'; c <= 'z'; c++) {
+ std::string k = std::string(8, c);
+    // Seeking at seqno=4, which is greater than the global seqno 3, still
+    // sees the key, since entries with seqno <= 4 are visible.
+ InternalKey ik(k, 4, kValueTypeForSeek);
+ iter->Seek(ik.Encode());
+ ASSERT_TRUE(iter->Valid());
+
+ ParsedInternalKey pik;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &pik));
+
+ ASSERT_EQ(pik.type, ValueType::kTypeValue);
+ ASSERT_EQ(pik.sequence, 3);
+ ASSERT_EQ(pik.user_key.ToString(), k);
+ ASSERT_EQ(iter->value().ToString(), k);
+ }
+
+ delete iter;
+}
+
+TEST_P(BlockBasedTableTest, BlockAlignTest) {
+ BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+ bbto.block_align = true;
+ test::StringSink* sink = new test::StringSink();
+ std::unique_ptr<WritableFileWriter> file_writer(
+ test::GetWritableFileWriter(sink, "" /* don't care */));
+ Options options;
+ options.compression = kNoCompression;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+ std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, ikc,
+ &int_tbl_prop_collector_factories, kNoCompression,
+ 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, -1),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer.get()));
+
+ for (int i = 1; i <= 10000; ++i) {
+ std::ostringstream ostr;
+ ostr << std::setfill('0') << std::setw(5) << i;
+ std::string key = ostr.str();
+ std::string value = "val";
+ InternalKey ik(key, 0, kTypeValue);
+
+ builder->Add(ik.Encode(), value);
+ }
+ ASSERT_OK(builder->Finish());
+ file_writer->Flush();
+
+ test::RandomRWStringSink ss_rw(sink);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(ss_rw.contents(), 73342, true)));
+
+  // Helper lambda to verify every data block is padded to the 4KB boundary
+ std::function<void()> VerifyBlockAlignment = [&]() {
+ TableProperties* props = nullptr;
+ ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(),
+ kBlockBasedTableMagicNumber, ioptions,
+ &props, true /* compression_type_missing */));
+
+ uint64_t data_block_size = props->data_size / props->num_data_blocks;
+ ASSERT_EQ(data_block_size, 4096);
+ ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
+ delete props;
+ };
+
+ VerifyBlockAlignment();
+
+  // The block below verifies that we can read back the keys. Set block_align
+  // to false when creating the reader to ensure we can flip between the two
+  // modes without any issues.
+ std::unique_ptr<TableReader> table_reader;
+ bbto.block_align = false;
+ Options options2;
+ options2.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ ImmutableCFOptions ioptions2(options2);
+ const MutableCFOptions moptions2(options2);
+
+ ASSERT_OK(ioptions.table_factory->NewTableReader(
+ TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(),
+ EnvOptions(),
+ GetPlainInternalComparator(options2.comparator)),
+ std::move(file_reader), ss_rw.contents().size(), &table_reader));
+
+ std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator(
+ ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+ int expected_key = 1;
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ std::ostringstream ostr;
+ ostr << std::setfill('0') << std::setw(5) << expected_key++;
+ std::string key = ostr.str();
+ std::string value = "val";
+
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key);
+ ASSERT_EQ(db_iter->value().ToString(), value);
+ }
+ expected_key--;
+ ASSERT_EQ(expected_key, 10000);
+ table_reader.reset();
+}
+
+TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
+ BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+ bbto.block_align = true;
+ test::StringSink* sink = new test::StringSink();
+ std::unique_ptr<WritableFileWriter> file_writer(
+ test::GetWritableFileWriter(sink, "" /* don't care */));
+
+ Options options;
+ options.compression = kNoCompression;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ InternalKeyComparator ikc(options.comparator);
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+ int_tbl_prop_collector_factories;
+ std::string column_family_name;
+
+ std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+ TableBuilderOptions(ioptions, moptions, ikc,
+ &int_tbl_prop_collector_factories, kNoCompression,
+ 0 /* sample_for_compression */, CompressionOptions(),
+ false /* skip_filters */, column_family_name, -1),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ file_writer.get()));
+
+ for (int i = 1; i <= 10000; ++i) {
+ std::ostringstream ostr;
+ ostr << std::setfill('0') << std::setw(5) << i;
+ std::string key = ostr.str();
+ std::string value = "val";
+ InternalKey ik(key, 0, kTypeValue);
+
+ builder->Add(ik.Encode(), value);
+ }
+ ASSERT_OK(builder->Finish());
+ file_writer->Flush();
+
+ test::RandomRWStringSink ss_rw(sink);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ test::GetRandomAccessFileReader(
+ new test::StringSource(ss_rw.contents(), 73342, true)));
+
+ {
+ RandomAccessFileReader* file = file_reader.get();
+ uint64_t file_size = ss_rw.contents().size();
+
+ Footer footer;
+ ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size,
+ &footer, kBlockBasedTableMagicNumber));
+
+ auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type,
+ BlockContents* contents) {
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+ PersistentCacheOptions cache_options;
+
+ BlockFetcher block_fetcher(
+ file, nullptr /* prefetch_buffer */, footer, read_options, handle,
+ contents, ioptions, false /* decompress */,
+ false /*maybe_compressed*/, block_type,
+ UncompressionDict::GetEmptyDict(), cache_options);
+
+ ASSERT_OK(block_fetcher.ReadBlockContents());
+ };
+
+ // -- Read metaindex block
+ auto metaindex_handle = footer.metaindex_handle();
+ BlockContents metaindex_contents;
+
+ BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex,
+ &metaindex_contents);
+ Block metaindex_block(std::move(metaindex_contents),
+ kDisableGlobalSequenceNumber);
+
+ std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator(
+ BytewiseComparator(), BytewiseComparator()));
+ bool found_properties_block = true;
+ ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block));
+ ASSERT_TRUE(found_properties_block);
+
+ // -- Read properties block
+ Slice v = meta_iter->value();
+ BlockHandle properties_handle;
+ ASSERT_OK(properties_handle.DecodeFrom(&v));
+ BlockContents properties_contents;
+
+ BlockFetchHelper(properties_handle, BlockType::kProperties,
+ &properties_contents);
+ Block properties_block(std::move(properties_contents),
+ kDisableGlobalSequenceNumber);
+
+ ASSERT_EQ(properties_block.NumRestarts(), 1u);
+ }
+}
+
+TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
+ // The properties meta-block should come at the end since we always need to
+ // read it when opening a file, unlike index/filter/other meta-blocks, which
+ // are sometimes read depending on the user's configuration. This ordering
+ // allows us to do a small readahead on the end of the file to read properties
+ // and meta-index blocks with one I/O.
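+  //
+  // A sketch of the intended open-time read pattern (editor's illustration;
+  // kTailLen is hypothetical):
+  //   tail = read(file_size - kTailLen, kTailLen)  // one I/O
+  //   footer     -> fixed-size, at the very end of tail
+  //   metaindex  -> located via footer.metaindex_handle(), inside tail
+  //   properties -> located via the metaindex entry, also inside tail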
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+ c.Add("a1", "val1");
+ c.Add("b2", "val2");
+ c.Add("c3", "val3");
+ c.Add("d4", "val4");
+ c.Add("e5", "val5");
+ c.Add("f6", "val6");
+ c.Add("g7", "val7");
+ c.Add("h8", "val8");
+ c.Add("j9", "val9");
+
+ // write an SST file
+ Options options;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(
+ 8 /* bits_per_key */, false /* use_block_based_filter */));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ImmutableCFOptions ioptions(options);
+ MutableCFOptions moptions(options);
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ c.Finish(options, ioptions, moptions, table_options,
+ GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+ // get file reader
+ test::StringSink* table_sink = c.TEST_GetSink();
+ std::unique_ptr<RandomAccessFileReader> table_reader{
+ test::GetRandomAccessFileReader(
+ new test::StringSource(table_sink->contents(), 0 /* unique_id */,
+ false /* allow_mmap_reads */))};
+ size_t table_size = table_sink->contents().size();
+
+ // read footer
+ Footer footer;
+ ASSERT_OK(ReadFooterFromFile(table_reader.get(),
+ nullptr /* prefetch_buffer */, table_size,
+ &footer, kBlockBasedTableMagicNumber));
+
+ // read metaindex
+ auto metaindex_handle = footer.metaindex_handle();
+ BlockContents metaindex_contents;
+ PersistentCacheOptions pcache_opts;
+ BlockFetcher block_fetcher(
+ table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
+ metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kMetaIndex,
+ UncompressionDict::GetEmptyDict(), pcache_opts,
+ nullptr /*memory_allocator*/);
+ ASSERT_OK(block_fetcher.ReadBlockContents());
+ Block metaindex_block(std::move(metaindex_contents),
+ kDisableGlobalSequenceNumber);
+
+ // verify properties block comes last
+ std::unique_ptr<InternalIterator> metaindex_iter{
+ metaindex_block.NewDataIterator(options.comparator, options.comparator)};
+ uint64_t max_offset = 0;
+ std::string key_at_max_offset;
+ for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+ metaindex_iter->Next()) {
+ BlockHandle handle;
+ Slice value = metaindex_iter->value();
+ ASSERT_OK(handle.DecodeFrom(&value));
+ if (handle.offset() > max_offset) {
+ max_offset = handle.offset();
+ key_at_max_offset = metaindex_iter->key().ToString();
+ }
+ }
+ ASSERT_EQ(kPropertiesBlock, key_at_max_offset);
+  // The index handle is stored in the footer rather than the metaindex
+  // block, so separate logic is needed to verify it comes before the
+  // properties block.
+ ASSERT_GT(max_offset, footer.index_handle().offset());
+ c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, BadOptions) {
+ ROCKSDB_NAMESPACE::Options options;
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+ bbto.block_size = 4000;
+ bbto.block_align = true;
+
+ const std::string kDBPath =
+ test::PerThreadDBPath("block_based_table_bad_options_test");
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyDB(kDBPath, options);
+ ROCKSDB_NAMESPACE::DB* db;
+ ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+
+ bbto.block_size = 4096;
+ options.compression = kSnappyCompression;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+}
+
+TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) {
+ TailPrefetchStats tpstats;
+ ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize());
+ tpstats.RecordEffectiveSize(size_t{1000});
+ tpstats.RecordEffectiveSize(size_t{1005});
+ tpstats.RecordEffectiveSize(size_t{1002});
+ ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize());
+
+  // A single extremely large value shouldn't influence the suggestion much.
+ tpstats.RecordEffectiveSize(size_t{1002000});
+ tpstats.RecordEffectiveSize(size_t{999});
+ ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize());
+ ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize());
+
+  // Only a history of the last 32 records is kept.
+ for (int i = 0; i < 32; i++) {
+ tpstats.RecordEffectiveSize(size_t{100});
+ }
+ ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize());
+
+  // 16 large values and 16 small values. The suggestion should end up closer
+  // to the small values, as the algorithm discounts the large outliers.
+ for (int i = 0; i < 16; i++) {
+ tpstats.RecordEffectiveSize(size_t{1000});
+ }
+ tpstats.RecordEffectiveSize(size_t{10});
+ tpstats.RecordEffectiveSize(size_t{20});
+ for (int i = 0; i < 6; i++) {
+ tpstats.RecordEffectiveSize(size_t{100});
+ }
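+  // At this point the 32-slot history holds 16 values of 1000 and 16 values
+  // of at most 100 (14 x 100 carried over, plus the 10 and 20 just added),
+  // which is why the suggestion is expected to land in [80, 200).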
+ ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize());
+ ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize());
+}
+
+TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
+ TailPrefetchStats tpstats;
+ FilePrefetchBuffer buffer(nullptr, 0, 0, false, true);
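+  // Editor's note: the trailing `true` enables min-offset tracking, so the
+  // buffer records the smallest offset ever requested; tail-prefetch sizing
+  // uses that as feedback.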
+ buffer.TryReadFromCache(500, 10, nullptr);
+ buffer.TryReadFromCache(480, 10, nullptr);
+ buffer.TryReadFromCache(490, 10, nullptr);
+ ASSERT_EQ(480, buffer.min_offset_read());
+}
+
+TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
+ const int kNumKeys = 500;
+ const int kKeySize = 8;
+ const int kValSize = 40;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.data_block_index_type =
+ BlockBasedTableOptions::kDataBlockBinaryAndHash;
+
+ Options options;
+ options.comparator = BytewiseComparator();
+
+ options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+ TableConstructor c(options.comparator);
+
+ static Random rnd(1048);
+ for (int i = 0; i < kNumKeys; i++) {
+    // Pad with "1" to mark existent keys; the non-existent probes later in
+    // this test flip the last byte to "0".
+ std::string random_key(RandomString(&rnd, kKeySize - 1) + "1");
+ InternalKey k(random_key, 0, kTypeValue);
+ c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize));
+ }
+
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ const InternalKeyComparator internal_comparator(options.comparator);
+ c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+ &keys, &kvmap);
+
+ auto reader = c.GetTableReader();
+
+ std::unique_ptr<InternalIterator> seek_iter;
+ seek_iter.reset(reader->NewIterator(
+ ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+ for (int i = 0; i < 2; ++i) {
+ ReadOptions ro;
+    // For every kv we look up using two methods: Get() and Seek().
+    // Get() uses the data block hash index; for a non-existent key it
+    // invalidates the block iterator, yielding kNotFound.
+    // Seek() uses the default BinarySeek() in the block, so for a
+    // non-existent key it lands on the closest key larger than the target.
+
+ // Search for existent keys
+ for (auto& kv : kvmap) {
+ if (i == 0) {
+ // Search using Seek()
+ seek_iter->Seek(kv.first);
+ ASSERT_OK(seek_iter->status());
+ ASSERT_TRUE(seek_iter->Valid());
+ ASSERT_EQ(seek_iter->key(), kv.first);
+ ASSERT_EQ(seek_iter->value(), kv.second);
+ } else {
+ // Search using Get()
+ PinnableSlice value;
+ std::string user_key = ExtractUserKey(kv.first).ToString();
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ ASSERT_OK(reader->Get(ro, kv.first, &get_context,
+ moptions.prefix_extractor.get()));
+ ASSERT_EQ(get_context.State(), GetContext::kFound);
+ ASSERT_EQ(value, Slice(kv.second));
+ value.Reset();
+ }
+ }
+
+ // Search for non-existent keys
+ for (auto& kv : kvmap) {
+ std::string user_key = ExtractUserKey(kv.first).ToString();
+ user_key.back() = '0'; // make it non-existent key
+ InternalKey internal_key(user_key, 0, kTypeValue);
+ std::string encoded_key = internal_key.Encode().ToString();
+ if (i == 0) { // Search using Seek()
+ seek_iter->Seek(encoded_key);
+ ASSERT_OK(seek_iter->status());
+ if (seek_iter->Valid()) {
+ ASSERT_TRUE(BytewiseComparator()->Compare(
+ user_key, ExtractUserKey(seek_iter->key())) < 0);
+ }
+ } else { // Search using Get()
+ PinnableSlice value;
+ GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, user_key, &value, nullptr,
+ nullptr, true, nullptr, nullptr);
+ ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
+ moptions.prefix_extractor.get()));
+ ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+ value.Reset();
+ }
+ }
+ }
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// IsOutOfBound()=true immediately after Seek(), to allow LevelIterator to
+// filter out the corresponding file.
+TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) {
+ TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+ c.Add("foo", "v1");
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+ options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_opt,
+ GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+ auto* reader = c.GetTableReader();
+ ReadOptions read_opt;
+ std::string upper_bound = "bar";
+ Slice upper_bound_slice(upper_bound);
+ read_opt.iterate_upper_bound = &upper_bound_slice;
+ std::unique_ptr<InternalIterator> iter;
+ iter.reset(new KeyConvertingIterator(reader->NewIterator(
+ read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->IsOutOfBound());
+ iter.reset(new KeyConvertingIterator(reader->NewIterator(
+ read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+ iter->Seek("foo");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->IsOutOfBound());
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// IsOutOfBound()=true after Next() if it finds the current index key is no
+// smaller than the upper bound, unless it is pointing to the last data block.
+TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
+ TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+ c.Add("bar", "v");
+ c.Add("foo", "v");
+ std::vector<std::string> keys;
+ stl_wrappers::KVMap kvmap;
+ Options options;
+ BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+ table_opt.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+ const ImmutableCFOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ c.Finish(options, ioptions, moptions, table_opt,
+ GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+ auto* reader = c.GetTableReader();
+ ReadOptions read_opt;
+ std::string ub1 = "bar_after";
+ Slice ub_slice1(ub1);
+ read_opt.iterate_upper_bound = &ub_slice1;
+ std::unique_ptr<InternalIterator> iter;
+ iter.reset(new KeyConvertingIterator(reader->NewIterator(
+ read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->IsOutOfBound());
+ std::string ub2 = "foo_after";
+ Slice ub_slice2(ub2);
+ read_opt.iterate_upper_bound = &ub_slice2;
+ iter.reset(new KeyConvertingIterator(reader->NewIterator(
+ read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_FALSE(iter->IsOutOfBound());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc
new file mode 100644
index 000000000..a17d56df5
--- /dev/null
+++ b/src/rocksdb/table/two_level_iterator.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+#include "db/pinned_iterators_manager.h"
+#include "memory/arena.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TwoLevelIndexIterator : public InternalIteratorBase<IndexValue> {
+ public:
+ explicit TwoLevelIndexIterator(
+ TwoLevelIteratorState* state,
+ InternalIteratorBase<IndexValue>* first_level_iter);
+
+ ~TwoLevelIndexIterator() override {
+ first_level_iter_.DeleteIter(false /* is_arena_mode */);
+ second_level_iter_.DeleteIter(false /* is_arena_mode */);
+ delete state_;
+ }
+
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() override;
+ void Prev() override;
+
+ bool Valid() const override { return second_level_iter_.Valid(); }
+ Slice key() const override {
+ assert(Valid());
+ return second_level_iter_.key();
+ }
+ IndexValue value() const override {
+ assert(Valid());
+ return second_level_iter_.value();
+ }
+ Status status() const override {
+ if (!first_level_iter_.status().ok()) {
+ assert(second_level_iter_.iter() == nullptr);
+ return first_level_iter_.status();
+ } else if (second_level_iter_.iter() != nullptr &&
+ !second_level_iter_.status().ok()) {
+ return second_level_iter_.status();
+ } else {
+ return status_;
+ }
+ }
+ void SetPinnedItersMgr(
+ PinnedIteratorsManager* /*pinned_iters_mgr*/) override {}
+ bool IsKeyPinned() const override { return false; }
+ bool IsValuePinned() const override { return false; }
+
+ private:
+ void SaveError(const Status& s) {
+ if (status_.ok() && !s.ok()) status_ = s;
+ }
+ void SkipEmptyDataBlocksForward();
+ void SkipEmptyDataBlocksBackward();
+ void SetSecondLevelIterator(InternalIteratorBase<IndexValue>* iter);
+ void InitDataBlock();
+
+ TwoLevelIteratorState* state_;
+ IteratorWrapperBase<IndexValue> first_level_iter_;
+  IteratorWrapperBase<IndexValue> second_level_iter_;  // Wrapped iter may be nullptr
+ Status status_;
+  // If second_level_iter_ wraps a non-null iterator, "data_block_handle_"
+  // holds the handle that was passed to state_->NewSecondaryIterator() to
+  // create it.
+ BlockHandle data_block_handle_;
+};
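+
+// All of the positioning operations below follow the same pattern: position
+// first_level_iter_, call InitDataBlock() to (re)open the block it points
+// at, position second_level_iter_ within that block, and finally skip over
+// any empty blocks in the direction of travel.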
+
+TwoLevelIndexIterator::TwoLevelIndexIterator(
+ TwoLevelIteratorState* state,
+ InternalIteratorBase<IndexValue>* first_level_iter)
+ : state_(state), first_level_iter_(first_level_iter) {}
+
+void TwoLevelIndexIterator::Seek(const Slice& target) {
+ first_level_iter_.Seek(target);
+
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.Seek(target);
+ }
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekForPrev(const Slice& target) {
+ first_level_iter_.Seek(target);
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.SeekForPrev(target);
+ }
+  if (!Valid()) {
+    if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) {
+      // Target is past the last index entry: retry in the last block.
+      first_level_iter_.SeekToLast();
+      InitDataBlock();
+      if (second_level_iter_.iter() != nullptr) {
+        second_level_iter_.SeekForPrev(target);
+      }
+    }
+    SkipEmptyDataBlocksBackward();
+  }
+}
+
+void TwoLevelIndexIterator::SeekToFirst() {
+ first_level_iter_.SeekToFirst();
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.SeekToFirst();
+ }
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekToLast() {
+ first_level_iter_.SeekToLast();
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.SeekToLast();
+ }
+ SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::Next() {
+ assert(Valid());
+ second_level_iter_.Next();
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::Prev() {
+ assert(Valid());
+ second_level_iter_.Prev();
+ SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() {
+ while (second_level_iter_.iter() == nullptr ||
+ (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+ // Move to next block
+ if (!first_level_iter_.Valid()) {
+ SetSecondLevelIterator(nullptr);
+ return;
+ }
+ first_level_iter_.Next();
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.SeekToFirst();
+ }
+ }
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() {
+ while (second_level_iter_.iter() == nullptr ||
+ (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+    // Move to preceding block
+ if (!first_level_iter_.Valid()) {
+ SetSecondLevelIterator(nullptr);
+ return;
+ }
+ first_level_iter_.Prev();
+ InitDataBlock();
+ if (second_level_iter_.iter() != nullptr) {
+ second_level_iter_.SeekToLast();
+ }
+ }
+}
+
+void TwoLevelIndexIterator::SetSecondLevelIterator(
+ InternalIteratorBase<IndexValue>* iter) {
+ InternalIteratorBase<IndexValue>* old_iter = second_level_iter_.Set(iter);
+ delete old_iter;
+}
+
+void TwoLevelIndexIterator::InitDataBlock() {
+ if (!first_level_iter_.Valid()) {
+ SetSecondLevelIterator(nullptr);
+ } else {
+ BlockHandle handle = first_level_iter_.value().handle;
+ if (second_level_iter_.iter() != nullptr &&
+ !second_level_iter_.status().IsIncomplete() &&
+ handle.offset() == data_block_handle_.offset()) {
+      // second_level_iter_ is already constructed for this block (and its
+      // last read did not fail with Status::Incomplete), so no need to
+      // change anything
+ } else {
+ InternalIteratorBase<IndexValue>* iter =
+ state_->NewSecondaryIterator(handle);
+ data_block_handle_ = handle;
+ SetSecondLevelIterator(iter);
+ }
+ }
+}
+
+} // namespace
+
+InternalIteratorBase<IndexValue>* NewTwoLevelIterator(
+ TwoLevelIteratorState* state,
+ InternalIteratorBase<IndexValue>* first_level_iter) {
+ return new TwoLevelIndexIterator(state, first_level_iter);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h
new file mode 100644
index 000000000..885dff84b
--- /dev/null
+++ b/src/rocksdb/table/two_level_iterator.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ReadOptions;
+class InternalKeyComparator;
+
+// TwoLevelIteratorState expects that the iterators it is used with are not
+// created using an arena (they are destroyed with plain delete).
+struct TwoLevelIteratorState {
+ TwoLevelIteratorState() {}
+
+ virtual ~TwoLevelIteratorState() {}
+ virtual InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+ const BlockHandle& handle) = 0;
+};
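+
+// A minimal sketch of an implementation (illustrative only: `BlockSource`
+// and `NewIndexBlockIterator` are hypothetical stand-ins, not RocksDB
+// APIs). For a real implementation, see the partitioned index support in
+// block_based_table_reader.cc.
+//
+//   struct ExampleIterState : public TwoLevelIteratorState {
+//     explicit ExampleIterState(BlockSource* source) : source_(source) {}
+//     InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+//         const BlockHandle& handle) override {
+//       // Materialize an iterator over the block identified by `handle`.
+//       return source_->NewIndexBlockIterator(handle);
+//     }
+//     BlockSource* source_;  // hypothetical block provider; not owned
+//   };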
+
+// Returns a new two-level iterator. A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key/value pairs. The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks. Takes ownership of "state" and
+// "first_level_iter" and will delete them when no longer needed.
+//
+// Uses "state" to convert a first_level_iter value into an iterator
+// over the contents of the corresponding block.
+// Note: this function expects that first_level_iter was not created using
+// the arena.
+extern InternalIteratorBase<IndexValue>* NewTwoLevelIterator(
+ TwoLevelIteratorState* state,
+ InternalIteratorBase<IndexValue>* first_level_iter);
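+
+// Usage sketch (illustrative only; `state` is some TwoLevelIteratorState
+// implementation and `index_iter` an index iterator the caller has built;
+// both are handed off to the returned iterator):
+//
+//   InternalIteratorBase<IndexValue>* iter =
+//       NewTwoLevelIterator(state, index_iter);
+//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+//     IndexValue v = iter->value();
+//     // v.handle identifies the block this index entry points to.
+//   }
+//   delete iter;  // also deletes state and index_iter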
+
+} // namespace ROCKSDB_NAMESPACE