summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/cuckoo
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/table/cuckoo')
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_builder.cc553
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_builder.h138
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc640
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_factory.cc104
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_factory.h82
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_reader.cc411
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_reader.h100
-rw-r--r--src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc584
8 files changed, 2612 insertions, 0 deletions
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
new file mode 100644
index 000000000..296825d94
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
@@ -0,0 +1,553 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/autovector.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+const std::string CuckooTablePropertyNames::kEmptyKey =
+ "rocksdb.cuckoo.bucket.empty.key";
+const std::string CuckooTablePropertyNames::kNumHashFunc =
+ "rocksdb.cuckoo.hash.num";
+const std::string CuckooTablePropertyNames::kHashTableSize =
+ "rocksdb.cuckoo.hash.size";
+const std::string CuckooTablePropertyNames::kValueLength =
+ "rocksdb.cuckoo.value.length";
+const std::string CuckooTablePropertyNames::kIsLastLevel =
+ "rocksdb.cuckoo.file.islastlevel";
+const std::string CuckooTablePropertyNames::kCuckooBlockSize =
+ "rocksdb.cuckoo.hash.cuckooblocksize";
+const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
+ "rocksdb.cuckoo.hash.identityfirst";
+const std::string CuckooTablePropertyNames::kUseModuleHash =
+ "rocksdb.cuckoo.hash.usemodule";
+const std::string CuckooTablePropertyNames::kUserKeyLength =
+ "rocksdb.cuckoo.hash.userkeylength";
+
+// Obtained by running echo rocksdb.table.cuckoo | sha1sum
+extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
+
+CuckooTableBuilder::CuckooTableBuilder(
+ WritableFileWriter* file, double max_hash_table_ratio,
+ uint32_t max_num_hash_table, uint32_t max_search_depth,
+ const Comparator* user_comparator, uint32_t cuckoo_block_size,
+ bool use_module_hash, bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+ uint32_t column_family_id, const std::string& column_family_name,
+ const std::string& db_id, const std::string& db_session_id,
+ uint64_t file_number)
+ : num_hash_func_(2),
+ file_(file),
+ max_hash_table_ratio_(max_hash_table_ratio),
+ max_num_hash_func_(max_num_hash_table),
+ max_search_depth_(max_search_depth),
+ cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
+ hash_table_size_(use_module_hash ? 0 : 2),
+ is_last_level_file_(false),
+ has_seen_first_key_(false),
+ has_seen_first_value_(false),
+ key_size_(0),
+ value_size_(0),
+ num_entries_(0),
+ num_values_(0),
+ ucomp_(user_comparator),
+ use_module_hash_(use_module_hash),
+ identity_as_first_hash_(identity_as_first_hash),
+ get_slice_hash_(get_slice_hash),
+ closed_(false) {
+ // Data is in a huge block.
+ properties_.num_data_blocks = 1;
+ properties_.index_size = 0;
+ properties_.filter_size = 0;
+ properties_.column_family_id = column_family_id;
+ properties_.column_family_name = column_family_name;
+ properties_.db_id = db_id;
+ properties_.db_session_id = db_session_id;
+ properties_.orig_file_number = file_number;
+ status_.PermitUncheckedError();
+ io_status_.PermitUncheckedError();
+}
+
+void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
+ if (num_entries_ >= kMaxVectorIdx - 1) {
+ status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
+ return;
+ }
+ ParsedInternalKey ikey;
+ Status pik_status =
+ ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO
+ if (!pik_status.ok()) {
+ status_ = Status::Corruption("Unable to parse key into internal key. ",
+ pik_status.getState());
+ return;
+ }
+ if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
+ status_ = Status::NotSupported("Unsupported key type " +
+ std::to_string(ikey.type));
+ return;
+ }
+
+ // Determine if we can ignore the sequence number and value type from
+ // internal keys by looking at sequence number from first key. We assume
+ // that if first key has a zero sequence number, then all the remaining
+ // keys will have zero seq. no.
+ if (!has_seen_first_key_) {
+ is_last_level_file_ = ikey.sequence == 0;
+ has_seen_first_key_ = true;
+ smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
+ }
+ if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
+ status_ = Status::NotSupported("all keys have to be the same size");
+ return;
+ }
+
+ if (ikey.type == kTypeValue) {
+ if (!has_seen_first_value_) {
+ has_seen_first_value_ = true;
+ value_size_ = value.size();
+ }
+ if (value_size_ != value.size()) {
+ status_ = Status::NotSupported("all values have to be the same size");
+ return;
+ }
+
+ if (is_last_level_file_) {
+ kvs_.append(ikey.user_key.data(), ikey.user_key.size());
+ } else {
+ kvs_.append(key.data(), key.size());
+ }
+ kvs_.append(value.data(), value.size());
+ ++num_values_;
+ } else {
+ if (is_last_level_file_) {
+ deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
+ } else {
+ deleted_keys_.append(key.data(), key.size());
+ }
+ }
+ ++num_entries_;
+
+ // In order to fill the empty buckets in the hash table, we identify a
+ // key which is not used so far (unused_user_key). We determine this by
+ // maintaining smallest and largest keys inserted so far in bytewise order
+ // and use them to find a key outside this range in Finish() operation.
+ // Note that this strategy is independent of user comparator used here.
+ if (ikey.user_key.compare(smallest_user_key_) < 0) {
+ smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ } else if (ikey.user_key.compare(largest_user_key_) > 0) {
+ largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+ }
+ if (!use_module_hash_) {
+ if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
+ hash_table_size_ *= 2;
+ }
+ }
+}
+
+bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
+ assert(closed_);
+ return idx >= num_values_;
+}
+
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+ assert(closed_);
+ if (IsDeletedKey(idx)) {
+ return Slice(
+ &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
+ static_cast<size_t>(key_size_));
+ }
+ return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
+ static_cast<size_t>(key_size_));
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+ assert(closed_);
+ return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+ assert(closed_);
+ if (IsDeletedKey(idx)) {
+ static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
+ return Slice(empty_value);
+ }
+ return Slice(
+ &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
+ static_cast<size_t>(value_size_));
+}
+
+Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
+ buckets->resize(
+ static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
+ uint32_t make_space_for_key_call_id = 0;
+ for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
+ uint64_t bucket_id = 0;
+ bool bucket_found = false;
+ autovector<uint64_t> hash_vals;
+ Slice user_key = GetUserKey(vector_idx);
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
+ ++hash_cnt) {
+ uint64_t hash_val =
+ CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
+ identity_as_first_hash_, get_slice_hash_);
+ // If there is a collision, check next cuckoo_block_size_ locations for
+ // empty locations. While checking, if we reach end of the hash table,
+ // stop searching and proceed for next hash function.
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++hash_val) {
+ if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+ kMaxVectorIdx) {
+ bucket_id = hash_val;
+ bucket_found = true;
+ break;
+ } else {
+ if (ucomp_->Compare(
+ user_key, GetUserKey((*buckets)[static_cast<size_t>(hash_val)]
+ .vector_idx)) == 0) {
+ return Status::NotSupported("Same key is being inserted again.");
+ }
+ hash_vals.push_back(hash_val);
+ }
+ }
+ }
+ while (!bucket_found &&
+ !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
+ &bucket_id)) {
+ // Rehash by increashing number of hash tables.
+ if (num_hash_func_ >= max_num_hash_func_) {
+ return Status::NotSupported("Too many collisions. Unable to hash.");
+ }
+ // We don't really need to rehash the entire table because old hashes are
+ // still valid and we only increased the number of hash functions.
+ uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
+ hash_table_size_, identity_as_first_hash_,
+ get_slice_hash_);
+ ++num_hash_func_;
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++hash_val) {
+ if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+ kMaxVectorIdx) {
+ bucket_found = true;
+ bucket_id = hash_val;
+ break;
+ } else {
+ hash_vals.push_back(hash_val);
+ }
+ }
+ }
+ (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
+ }
+ return Status::OK();
+}
+
+Status CuckooTableBuilder::Finish() {
+ assert(!closed_);
+ closed_ = true;
+ std::vector<CuckooBucket> buckets;
+ std::string unused_bucket;
+ if (num_entries_ > 0) {
+ // Calculate the real hash size if module hash is enabled.
+ if (use_module_hash_) {
+ hash_table_size_ =
+ static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
+ }
+ status_ = MakeHashTable(&buckets);
+ if (!status_.ok()) {
+ return status_;
+ }
+ // Determine unused_user_key to fill empty buckets.
+ std::string unused_user_key = smallest_user_key_;
+ int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+ while (curr_pos >= 0) {
+ --unused_user_key[curr_pos];
+ if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
+ break;
+ }
+ --curr_pos;
+ }
+ if (curr_pos < 0) {
+ // Try using the largest key to identify an unused key.
+ unused_user_key = largest_user_key_;
+ curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+ while (curr_pos >= 0) {
+ ++unused_user_key[curr_pos];
+ if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
+ break;
+ }
+ --curr_pos;
+ }
+ }
+ if (curr_pos < 0) {
+ return Status::Corruption("Unable to find unused key");
+ }
+ if (is_last_level_file_) {
+ unused_bucket = unused_user_key;
+ } else {
+ ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
+ AppendInternalKey(&unused_bucket, ikey);
+ }
+ }
+ properties_.num_entries = num_entries_;
+ properties_.num_deletions = num_entries_ - num_values_;
+ properties_.fixed_key_len = key_size_;
+ properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
+ .assign(reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+
+ uint64_t bucket_size = key_size_ + value_size_;
+ unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
+ // Write the table.
+ uint32_t num_added = 0;
+ for (auto& bucket : buckets) {
+ if (bucket.vector_idx == kMaxVectorIdx) {
+ io_status_ = file_->Append(Slice(unused_bucket));
+ } else {
+ ++num_added;
+ io_status_ = file_->Append(GetKey(bucket.vector_idx));
+ if (io_status_.ok()) {
+ if (value_size_ > 0) {
+ io_status_ = file_->Append(GetValue(bucket.vector_idx));
+ }
+ }
+ }
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+ }
+ assert(num_added == NumEntries());
+ properties_.raw_key_size = num_added * properties_.fixed_key_len;
+ properties_.raw_value_size = num_added * value_size_;
+
+ uint64_t offset = buckets.size() * bucket_size;
+ properties_.data_size = offset;
+ unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
+ properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
+ unused_bucket;
+ properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+ .assign(reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
+
+ properties_
+ .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
+ .assign(reinterpret_cast<const char*>(&hash_table_size_),
+ sizeof(hash_table_size_));
+ properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+ .assign(reinterpret_cast<const char*>(&is_last_level_file_),
+ sizeof(is_last_level_file_));
+ properties_
+ .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
+ .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
+ sizeof(cuckoo_block_size_));
+ properties_
+ .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
+ .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
+ sizeof(identity_as_first_hash_));
+ properties_
+ .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
+ .assign(reinterpret_cast<const char*>(&use_module_hash_),
+ sizeof(use_module_hash_));
+ uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
+ properties_
+ .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
+ .assign(reinterpret_cast<const char*>(&user_key_len),
+ sizeof(user_key_len));
+
+ // Write meta blocks.
+ MetaIndexBuilder meta_index_builder;
+ PropertyBlockBuilder property_block_builder;
+
+ property_block_builder.AddTableProperty(properties_);
+ property_block_builder.Add(properties_.user_collected_properties);
+ Slice property_block = property_block_builder.Finish();
+ BlockHandle property_block_handle;
+ property_block_handle.set_offset(offset);
+ property_block_handle.set_size(property_block.size());
+ io_status_ = file_->Append(property_block);
+ offset += property_block.size();
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+
+ meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
+ Slice meta_index_block = meta_index_builder.Finish();
+
+ BlockHandle meta_index_block_handle;
+ meta_index_block_handle.set_offset(offset);
+ meta_index_block_handle.set_size(meta_index_block.size());
+ io_status_ = file_->Append(meta_index_block);
+ if (!io_status_.ok()) {
+ status_ = io_status_;
+ return status_;
+ }
+
+ FooterBuilder footer;
+ footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
+ kNoChecksum, meta_index_block_handle);
+ io_status_ = file_->Append(footer.GetSlice());
+ status_ = io_status_;
+ return status_;
+}
+
+void CuckooTableBuilder::Abandon() {
+ assert(!closed_);
+ closed_ = true;
+}
+
+uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }
+
+uint64_t CuckooTableBuilder::FileSize() const {
+ if (closed_) {
+ return file_->GetFileSize();
+ } else if (num_entries_ == 0) {
+ return 0;
+ }
+
+ if (use_module_hash_) {
+ return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
+ max_hash_table_ratio_);
+ } else {
+ // Account for buckets being a power of two.
+ // As elements are added, file size remains constant for a while and
+ // doubles its size. Since compaction algorithm stops adding elements
+ // only after it exceeds the file limit, we account for the extra element
+ // being added here.
+ uint64_t expected_hash_table_size = hash_table_size_;
+ if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
+ expected_hash_table_size *= 2;
+ }
+ return (key_size_ + value_size_) * expected_hash_table_size - 1;
+ }
+}
+
+// This method is invoked when there is no place to insert the target key.
+// It searches for a set of elements that can be moved to accommodate target
+// key. The search is a BFS graph traversal with first level (hash_vals)
+// being all the buckets target key could go to.
+// Then, from each node (curr_node), we find all the buckets that curr_node
+// could go to. They form the children of curr_node in the tree.
+// We continue the traversal until we find an empty bucket, in which case, we
+// move all elements along the path from first level to this empty bucket, to
+// make space for target key which is inserted at first level (*bucket_id).
+// If tree depth exceedes max depth, we return false indicating failure.
+bool CuckooTableBuilder::MakeSpaceForKey(
+ const autovector<uint64_t>& hash_vals,
+ const uint32_t make_space_for_key_call_id,
+ std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
+ struct CuckooNode {
+ uint64_t bucket_id;
+ uint32_t depth;
+ uint32_t parent_pos;
+ CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
+ : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
+ };
+ // This is BFS search tree that is stored simply as a vector.
+ // Each node stores the index of parent node in the vector.
+ std::vector<CuckooNode> tree;
+ // We want to identify already visited buckets in the current method call so
+ // that we don't add same buckets again for exploration in the tree.
+ // We do this by maintaining a count of current method call in
+ // make_space_for_key_call_id, which acts as a unique id for this invocation
+ // of the method. We store this number into the nodes that we explore in
+ // current method call.
+ // It is unlikely for the increment operation to overflow because the maximum
+ // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+ uint64_t bid = hash_vals[hash_cnt];
+ (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
+ make_space_for_key_call_id;
+ tree.push_back(CuckooNode(bid, 0, 0));
+ }
+ bool null_found = false;
+ uint32_t curr_pos = 0;
+ while (!null_found && curr_pos < tree.size()) {
+ CuckooNode& curr_node = tree[curr_pos];
+ uint32_t curr_depth = curr_node.depth;
+ if (curr_depth >= max_search_depth_) {
+ break;
+ }
+ CuckooBucket& curr_bucket =
+ (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
+ ++hash_cnt) {
+ uint64_t child_bucket_id = CuckooHash(
+ GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
+ hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+ // Iterate inside Cuckoo Block.
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, ++child_bucket_id) {
+ if ((*buckets)[static_cast<size_t>(child_bucket_id)]
+ .make_space_for_key_call_id == make_space_for_key_call_id) {
+ continue;
+ }
+ (*buckets)[static_cast<size_t>(child_bucket_id)]
+ .make_space_for_key_call_id = make_space_for_key_call_id;
+ tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
+ if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
+ kMaxVectorIdx) {
+ null_found = true;
+ break;
+ }
+ }
+ }
+ ++curr_pos;
+ }
+
+ if (null_found) {
+ // There is an empty node in tree.back(). Now, traverse the path from this
+ // empty node to top of the tree and at every node in the path, replace
+ // child with the parent. Stop when first level is reached in the tree
+ // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
+ // this location in first level for target key to be inserted.
+ uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
+ while (bucket_to_replace_pos >= num_hash_func_) {
+ CuckooNode& curr_node = tree[bucket_to_replace_pos];
+ (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
+ (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
+ bucket_to_replace_pos = curr_node.parent_pos;
+ }
+ *bucket_id = tree[bucket_to_replace_pos].bucket_id;
+ }
+ return null_found;
+}
+
+std::string CuckooTableBuilder::GetFileChecksum() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksum();
+ } else {
+ return kUnknownFileChecksum;
+ }
+}
+
+const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
+ if (file_ != nullptr) {
+ return file_->GetFileChecksumFuncName();
+ } else {
+ return kUnknownFileChecksumFuncName;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
new file mode 100644
index 000000000..a125e1f4c
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableBuilder : public TableBuilder {
+ public:
+ CuckooTableBuilder(
+ WritableFileWriter* file, double max_hash_table_ratio,
+ uint32_t max_num_hash_func, uint32_t max_search_depth,
+ const Comparator* user_comparator, uint32_t cuckoo_block_size,
+ bool use_module_hash, bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+ uint32_t column_family_id, const std::string& column_family_name,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ uint64_t file_number = 0);
+ // No copying allowed
+ CuckooTableBuilder(const CuckooTableBuilder&) = delete;
+ void operator=(const CuckooTableBuilder&) = delete;
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~CuckooTableBuilder() {}
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value) override;
+
+ // Return non-ok iff some error has been detected.
+ Status status() const override { return status_; }
+
+ // Return non-ok iff some error happens during IO.
+ IOStatus io_status() const override { return io_status_; }
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish() override;
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon() override;
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const override;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const override;
+
+ TableProperties GetTableProperties() const override { return properties_; }
+
+ // Get file checksum
+ std::string GetFileChecksum() const override;
+
+ // Get file checksum function name
+ const char* GetFileChecksumFuncName() const override;
+
+ private:
+ struct CuckooBucket {
+ CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+ uint32_t vector_idx;
+ // This number will not exceed kvs_.size() + max_num_hash_func_.
+ // We assume number of items is <= 2^32.
+ uint32_t make_space_for_key_call_id;
+ };
+ static const uint32_t kMaxVectorIdx = std::numeric_limits<int32_t>::max();
+
+ bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
+ const uint32_t call_id,
+ std::vector<CuckooBucket>* buckets, uint64_t* bucket_id);
+ Status MakeHashTable(std::vector<CuckooBucket>* buckets);
+
+ inline bool IsDeletedKey(uint64_t idx) const;
+ inline Slice GetKey(uint64_t idx) const;
+ inline Slice GetUserKey(uint64_t idx) const;
+ inline Slice GetValue(uint64_t idx) const;
+
+ uint32_t num_hash_func_;
+ WritableFileWriter* file_;
+ const double max_hash_table_ratio_;
+ const uint32_t max_num_hash_func_;
+ const uint32_t max_search_depth_;
+ const uint32_t cuckoo_block_size_;
+ uint64_t hash_table_size_;
+ bool is_last_level_file_;
+ bool has_seen_first_key_;
+ bool has_seen_first_value_;
+ uint64_t key_size_;
+ uint64_t value_size_;
+ // A list of fixed-size key-value pairs concatenating into a string.
+ // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+ // key / value given an index
+ std::string kvs_;
+ std::string deleted_keys_;
+ // Number of key-value pairs stored in kvs_ + number of deleted keys
+ uint64_t num_entries_;
+ // Number of keys that contain value (non-deletion op)
+ uint64_t num_values_;
+ Status status_;
+ IOStatus io_status_;
+ TableProperties properties_;
+ const Comparator* ucomp_;
+ bool use_module_hash_;
+ bool identity_as_first_hash_;
+ uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+ uint64_t max_num_buckets);
+ std::string largest_user_key_ = "";
+ std::string smallest_user_key_ = "";
+
+ bool closed_; // Either Finish() or Abandon() has been called.
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
new file mode 100644
index 000000000..be1c62117
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
@@ -0,0 +1,640 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+extern const uint64_t kCuckooTableMagicNumber;
+
+namespace {
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+ uint64_t /*max_num_buckets*/) {
+ return hash_map[s.ToString()][index];
+}
+} // namespace
+
+class CuckooBuilderTest : public testing::Test {
+ public:
+ CuckooBuilderTest() {
+ env_ = Env::Default();
+ Options options;
+ options.allow_mmap_reads = true;
+ file_options_ = FileOptions(options);
+ }
+
+ void CheckFileContents(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values,
+ const std::vector<uint64_t>& expected_locations,
+ std::string expected_unused_bucket,
+ uint64_t expected_table_size,
+ uint32_t expected_num_hash_func,
+ bool expected_is_last_level,
+ uint32_t expected_cuckoo_block_size = 1) {
+ uint64_t num_deletions = 0;
+ for (const auto& key : keys) {
+ ParsedInternalKey parsed;
+ Status pik_status =
+ ParseInternalKey(key, &parsed, true /* log_err_key */);
+ if (pik_status.ok() && parsed.type == kTypeDeletion) {
+ num_deletions++;
+ }
+ }
+ // Read file
+ uint64_t read_file_size;
+ ASSERT_OK(env_->GetFileSize(fname, &read_file_size));
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(
+ env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr));
+
+ Options options;
+ options.allow_mmap_reads = true;
+ ImmutableOptions ioptions(options);
+
+ // Assert Table Properties.
+ std::unique_ptr<TableProperties> props;
+ ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size,
+ kCuckooTableMagicNumber, ioptions, &props));
+ // Check unused bucket.
+ std::string unused_key =
+ props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey];
+ ASSERT_EQ(expected_unused_bucket.substr(0, props->fixed_key_len),
+ unused_key);
+
+ uint64_t value_len_found = *reinterpret_cast<const uint64_t*>(
+ props->user_collected_properties[CuckooTablePropertyNames::kValueLength]
+ .data());
+ ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
+ ASSERT_EQ(props->raw_value_size, values.size() * value_len_found);
+ const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+ props
+ ->user_collected_properties
+ [CuckooTablePropertyNames::kHashTableSize]
+ .data());
+ ASSERT_EQ(expected_table_size, table_size);
+ const uint32_t num_hash_func_found = *reinterpret_cast<const uint32_t*>(
+ props->user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+ .data());
+ ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
+ const uint32_t cuckoo_block_size = *reinterpret_cast<const uint32_t*>(
+ props
+ ->user_collected_properties
+ [CuckooTablePropertyNames::kCuckooBlockSize]
+ .data());
+ ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
+ const bool is_last_level_found = *reinterpret_cast<const bool*>(
+ props->user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+ .data());
+ ASSERT_EQ(expected_is_last_level, is_last_level_found);
+
+ ASSERT_EQ(props->num_entries, keys.size());
+ ASSERT_EQ(props->num_deletions, num_deletions);
+ ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
+ ASSERT_EQ(props->data_size,
+ expected_unused_bucket.size() *
+ (expected_table_size + expected_cuckoo_block_size - 1));
+ ASSERT_EQ(props->raw_key_size, keys.size() * props->fixed_key_len);
+ ASSERT_EQ(props->column_family_id, 0);
+ ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName);
+
+ // Check contents of the bucket.
+ std::vector<bool> keys_found(keys.size(), false);
+ size_t bucket_size = expected_unused_bucket.size();
+ for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) {
+ Slice read_slice;
+ ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size,
+ &read_slice, nullptr, nullptr,
+ Env::IO_TOTAL /* rate_limiter_priority */));
+ size_t key_idx =
+ std::find(expected_locations.begin(), expected_locations.end(), i) -
+ expected_locations.begin();
+ if (key_idx == keys.size()) {
+ // i is not one of the expected locations. Empty bucket.
+ if (read_slice.data() == nullptr) {
+ ASSERT_EQ(0, expected_unused_bucket.size());
+ } else {
+ ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0);
+ }
+ } else {
+ keys_found[key_idx] = true;
+ ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0);
+ }
+ }
+ for (auto key_found : keys_found) {
+ // Check that all keys wereReader found.
+ ASSERT_TRUE(key_found);
+ }
+ }
+
+ std::string GetInternalKey(Slice user_key, bool zero_seqno,
+ ValueType type = kTypeValue) {
+ IterKey ikey;
+ ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, type);
+ return ikey.GetInternalKey().ToString();
+ }
+
+ uint64_t NextPowOf2(uint64_t num) {
+ uint64_t n = 2;
+ while (n <= num) {
+ n *= 2;
+ }
+ return n;
+ }
+
+ uint64_t GetExpectedTableSize(uint64_t num) {
+ return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio));
+ }
+
+ Env* env_;
+ FileOptions file_options_;
+ std::string fname;
+ const double kHashTableRatio = 0.9;
+};
+
+TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) {
+ std::unique_ptr<WritableFile> writable_file;
+ fname = test::PerThreadDBPath("EmptyFile");
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100,
+ BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ ASSERT_EQ(0UL, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ CheckFileContents({}, {}, {}, "", 2, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
+ for (auto type : {kTypeValue, kTypeDeletion}) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values;
+ if (type == kTypeValue) {
+ values = {"v01", "v02", "v03", "v04"};
+ } else {
+ values = {"", "", "", ""};
+ }
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {1, 2, 3, 4}},
+ {user_keys[2], {2, 3, 4, 5}},
+ {user_keys[3], {3, 4, 5, 6}}};
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false, type));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ fname = test::PerThreadDBPath("NoCollisionFullKey");
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 2, false);
+ }
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ fname = test::PerThreadDBPath("WithCollisionFullKey");
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 4, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ uint32_t cuckoo_block_size = 2;
+ fname = test::PerThreadDBPath("WithCollisionFullKey2");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(
+ file_writer.get(), kHashTableRatio, num_hash_fun, 100,
+ BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash,
+ 0 /* column_family_id */, kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 3, false, cuckoo_block_size);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
+ // Have two hash functions. Insert elements with overlapping hashes.
+ // Finally insert an element with hash value somewhere in the middle
+ // so that it displaces all the elements after that.
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+ "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("WithCollisionPathFullKey");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+ "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {3, 4}},
+ {user_keys[3], {4, 5}}, {user_keys[4], {0, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
+ std::vector<std::string> keys;
+ for (auto& user_key : user_keys) {
+ keys.push_back(GetInternalKey(user_key, false));
+ }
+ uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 2, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(keys[i]), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = GetInternalKey("key00", true);
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+ expected_table_size, 2, false, 2);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {1, 2, 3, 4}},
+ {user_keys[2], {2, 3, 4, 5}},
+ {user_keys[3], {3, 4, 5, 6}}};
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("NoCollisionUserKey");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
+ uint32_t num_hash_fun = 4;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1, 2, 3}},
+ {user_keys[1], {0, 1, 2, 3}},
+ {user_keys[2], {0, 1, 2, 3}},
+ {user_keys[3], {0, 1, 2, 3}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("WithCollisionUserKey");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 4, true);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+ "key05"};
+ std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
+ };
+ hash_map = std::move(hm);
+
+ std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+ uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 2, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ size_t bucket_size = user_keys[0].size() + values[0].size();
+ ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+ ASSERT_OK(builder.Finish());
+ ASSERT_OK(file_writer->Close());
+ ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+ std::string expected_unused_bucket = "key00";
+ expected_unused_bucket += std::string(values[0].size(), 'a');
+ CheckFileContents(user_keys, values, expected_locations,
+ expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
+ // Have two hash functions. Insert elements with overlapping hashes.
+ // Finally try inserting an element with hash value somewhere in the middle
+ // and it should fail because the no. of elements to displace is too high.
+ uint32_t num_hash_fun = 2;
+ std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+ "key05"};
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+ {user_keys[3], {3, 4}}, {user_keys[4], {0, 1}},
+ };
+ hash_map = std::move(hm);
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 2, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t i = 0; i < user_keys.size(); i++) {
+ builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
+ ASSERT_EQ(builder.NumEntries(), i + 1);
+ ASSERT_OK(builder.status());
+ }
+ ASSERT_TRUE(builder.Finish().IsNotSupported());
+ ASSERT_OK(file_writer->Close());
+}
+
+TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
+ // Need to have a temporary variable here as VS compiler does not currently
+ // support operator= with initializer_list as a parameter
+ std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+ {"repeatedkey", {0, 1, 2, 3}}};
+ hash_map = std::move(hm);
+ uint32_t num_hash_fun = 4;
+ std::string user_key = "repeatedkey";
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ fname = test::PerThreadDBPath("FailWhenSameKeyInserted");
+ ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+ file_options_, &file_writer, nullptr));
+ CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+ 100, BytewiseComparator(), 1, false, false,
+ GetSliceHash, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+
+ builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));
+ ASSERT_EQ(builder.NumEntries(), 1u);
+ ASSERT_OK(builder.status());
+ builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2"));
+ ASSERT_EQ(builder.NumEntries(), 2u);
+ ASSERT_OK(builder.status());
+
+ ASSERT_TRUE(builder.Finish().IsNotSupported());
+ ASSERT_OK(file_writer->Close());
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
new file mode 100644
index 000000000..1253c92dd
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_factory.h"
+
+#include "db/dbformat.h"
+#include "options/configurable_helper.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status CuckooTableFactory::NewTableReader(
+ const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const {
+ std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(
+ table_reader_options.ioptions, std::move(file), file_size,
+ table_reader_options.internal_comparator.user_comparator(), nullptr));
+ Status s = new_reader->status();
+ if (s.ok()) {
+ *table = std::move(new_reader);
+ }
+ return s;
+}
+
+TableBuilder* CuckooTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const {
+ // TODO: change builder to take the option struct
+ return new CuckooTableBuilder(
+ file, table_options_.hash_table_ratio, 64,
+ table_options_.max_search_depth,
+ table_builder_options.internal_comparator.user_comparator(),
+ table_options_.cuckoo_block_size, table_options_.use_module_hash,
+ table_options_.identity_as_first_hash, nullptr /* get_slice_hash */,
+ table_builder_options.column_family_id,
+ table_builder_options.column_family_name, table_builder_options.db_id,
+ table_builder_options.db_session_id, table_builder_options.cur_file_num);
+}
+
+std::string CuckooTableFactory::GetPrintableOptions() const {
+ std::string ret;
+ ret.reserve(2000);
+ const int kBufferSize = 200;
+ char buffer[kBufferSize];
+
+ snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n",
+ table_options_.hash_table_ratio);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " max_search_depth: %u\n",
+ table_options_.max_search_depth);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n",
+ table_options_.cuckoo_block_size);
+ ret.append(buffer);
+ snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n",
+ table_options_.identity_as_first_hash);
+ ret.append(buffer);
+ return ret;
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> cuckoo_table_type_info =
+ {
+#ifndef ROCKSDB_LITE
+ {"hash_table_ratio",
+ {offsetof(struct CuckooTableOptions, hash_table_ratio),
+ OptionType::kDouble, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"max_search_depth",
+ {offsetof(struct CuckooTableOptions, max_search_depth),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"cuckoo_block_size",
+ {offsetof(struct CuckooTableOptions, cuckoo_block_size),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"identity_as_first_hash",
+ {offsetof(struct CuckooTableOptions, identity_as_first_hash),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"use_module_hash",
+ {offsetof(struct CuckooTableOptions, use_module_hash),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options)
+ : table_options_(table_options) {
+ RegisterOptions(&table_options_, &cuckoo_table_type_info);
+}
+
+TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) {
+ return new CuckooTableFactory(table_options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
new file mode 100644
index 000000000..9937c28dd
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "util/murmurhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t kCuckooMurmurSeedMultiplier = 816922183;
+static inline uint64_t CuckooHash(
+ const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
+ uint64_t table_size_, bool identity_as_first_hash,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
+#if !defined NDEBUG || defined OS_WIN
+ // This part is used only in unit tests but we have to keep it for Windows
+ // build as we run test in both debug and release modes under Windows.
+ if (get_slice_hash != nullptr) {
+ return get_slice_hash(user_key, hash_cnt, table_size_);
+ }
+#else
+ (void)get_slice_hash;
+#endif
+
+ uint64_t value = 0;
+ if (hash_cnt == 0 && identity_as_first_hash) {
+ value = (*reinterpret_cast<const int64_t*>(user_key.data()));
+ } else {
+ value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()),
+ kCuckooMurmurSeedMultiplier * hash_cnt);
+ }
+ if (use_module_hash) {
+ return value % table_size_;
+ } else {
+ return value & (table_size_ - 1);
+ }
+}
+
+// Cuckoo Table is designed for applications that require fast point lookups
+// but not fast range scans.
+//
+// Some assumptions:
+// - Key length and Value length are fixed.
+// - Does not support Snapshot.
+// - Does not support Merge operations.
+// - Does not support prefix bloom filters.
+class CuckooTableFactory : public TableFactory {
+ public:
+ explicit CuckooTableFactory(
+ const CuckooTableOptions& table_option = CuckooTableOptions());
+ ~CuckooTableFactory() {}
+
+ // Method to allow CheckedCast to work for this class
+ static const char* kClassName() { return kCuckooTableName(); }
+ const char* Name() const override { return kCuckooTableName(); }
+
+ using TableFactory::NewTableReader;
+ Status NewTableReader(
+ const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool prefetch_index_and_filter_in_cache = true) const override;
+
+ TableBuilder* NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const override;
+
+ std::string GetPrintableOptions() const override;
+
+ private:
+ CuckooTableOptions table_options_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
new file mode 100644
index 000000000..1d70909a6
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
@@ -0,0 +1,411 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
+const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+} // namespace
+
+extern const uint64_t kCuckooTableMagicNumber;
+
+CuckooTableReader::CuckooTableReader(
+ const ImmutableOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ const Comparator* comparator,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
+ : file_(std::move(file)),
+ is_last_level_(false),
+ identity_as_first_hash_(false),
+ use_module_hash_(false),
+ num_hash_func_(0),
+ unused_key_(""),
+ key_length_(0),
+ user_key_length_(0),
+ value_length_(0),
+ bucket_length_(0),
+ cuckoo_block_size_(0),
+ cuckoo_block_bytes_minus_one_(0),
+ table_size_(0),
+ ucomp_(comparator),
+ get_slice_hash_(get_slice_hash) {
+ if (!ioptions.allow_mmap_reads) {
+ status_ = Status::InvalidArgument("File is not mmaped");
+ return;
+ }
+ {
+ std::unique_ptr<TableProperties> props;
+ status_ = ReadTableProperties(file_.get(), file_size,
+ kCuckooTableMagicNumber, ioptions, &props);
+ if (!status_.ok()) {
+ return;
+ }
+ table_props_ = std::move(props);
+ }
+ auto& user_props = table_props_->user_collected_properties;
+ auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
+ if (hash_funs == user_props.end()) {
+ status_ = Status::Corruption("Number of hash functions not found");
+ return;
+ }
+ num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
+ auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
+ if (unused_key == user_props.end()) {
+ status_ = Status::Corruption("Empty bucket value not found");
+ return;
+ }
+ unused_key_ = unused_key->second;
+
+ key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len);
+ auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
+ if (user_key_len == user_props.end()) {
+ status_ = Status::Corruption("User key length not found");
+ return;
+ }
+ user_key_length_ =
+ *reinterpret_cast<const uint32_t*>(user_key_len->second.data());
+
+ auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
+ if (value_length == user_props.end()) {
+ status_ = Status::Corruption("Value length not found");
+ return;
+ }
+ value_length_ =
+ *reinterpret_cast<const uint32_t*>(value_length->second.data());
+ bucket_length_ = key_length_ + value_length_;
+
+ auto hash_table_size =
+ user_props.find(CuckooTablePropertyNames::kHashTableSize);
+ if (hash_table_size == user_props.end()) {
+ status_ = Status::Corruption("Hash table size not found");
+ return;
+ }
+ table_size_ =
+ *reinterpret_cast<const uint64_t*>(hash_table_size->second.data());
+
+ auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
+ if (is_last_level == user_props.end()) {
+ status_ = Status::Corruption("Is last level not found");
+ return;
+ }
+ is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
+
+ auto identity_as_first_hash =
+ user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash);
+ if (identity_as_first_hash == user_props.end()) {
+ status_ = Status::Corruption("identity as first hash not found");
+ return;
+ }
+ identity_as_first_hash_ =
+ *reinterpret_cast<const bool*>(identity_as_first_hash->second.data());
+
+ auto use_module_hash =
+ user_props.find(CuckooTablePropertyNames::kUseModuleHash);
+ if (use_module_hash == user_props.end()) {
+ status_ = Status::Corruption("hash type is not found");
+ return;
+ }
+ use_module_hash_ =
+ *reinterpret_cast<const bool*>(use_module_hash->second.data());
+ auto cuckoo_block_size =
+ user_props.find(CuckooTablePropertyNames::kCuckooBlockSize);
+ if (cuckoo_block_size == user_props.end()) {
+ status_ = Status::Corruption("Cuckoo block size not found");
+ return;
+ }
+ cuckoo_block_size_ =
+ *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data());
+ cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
+ // TODO: rate limit reads of whole cuckoo tables.
+ status_ =
+ file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_,
+ nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+}
+
+Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
+ const Slice& key, GetContext* get_context,
+ const SliceTransform* /* prefix_extractor */,
+ bool /*skip_filters*/) {
+ assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
+ Slice user_key = ExtractUserKey(key);
+ for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+ uint64_t offset =
+ bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
+ table_size_, identity_as_first_hash_,
+ get_slice_hash_);
+ const char* bucket = &file_data_.data()[offset];
+ for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+ ++block_idx, bucket += bucket_length_) {
+ if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
+ Slice(bucket, user_key.size()))) {
+ return Status::OK();
+ }
+ // Here, we compare only the user key part as we support only one entry
+ // per user key and we don't support snapshot.
+ if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
+ Slice value(bucket + key_length_, value_length_);
+ if (is_last_level_) {
+ // Sequence number is not stored at the last level, so we will use
+ // kMaxSequenceNumber since it is unknown. This could cause some
+ // transactions to fail to lock a key due to known sequence number.
+ // However, it is expected for anyone to use a CuckooTable in a
+ // TransactionDB.
+ get_context->SaveValue(value, kMaxSequenceNumber);
+ } else {
+ Slice full_key(bucket, key_length_);
+ ParsedInternalKey found_ikey;
+ Status s = ParseInternalKey(full_key, &found_ikey,
+ false /* log_err_key */); // TODO
+ if (!s.ok()) return s;
+ bool dont_care __attribute__((__unused__));
+ get_context->SaveValue(found_ikey, value, &dont_care);
+ }
+ // We don't support merge operations. So, we return here.
+ return Status::OK();
+ }
+ }
+ }
+ return Status::OK();
+}
+
+void CuckooTableReader::Prepare(const Slice& key) {
+ // Prefetch the first Cuckoo Block.
+ Slice user_key = ExtractUserKey(key);
+ uint64_t addr =
+ reinterpret_cast<uint64_t>(file_data_.data()) +
+ bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+ identity_as_first_hash_, nullptr);
+ uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
+ for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
+ PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
+ }
+}
+
+class CuckooTableIterator : public InternalIterator {
+ public:
+ explicit CuckooTableIterator(CuckooTableReader* reader);
+ // No copying allowed
+ CuckooTableIterator(const CuckooTableIterator&) = delete;
+ void operator=(const Iterator&) = delete;
+ ~CuckooTableIterator() override {}
+ bool Valid() const override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void Next() override;
+ void Prev() override;
+ Slice key() const override;
+ Slice value() const override;
+ Status status() const override { return Status::OK(); }
+ void InitIfNeeded();
+
+ private:
+ struct BucketComparator {
+ BucketComparator(const Slice& file_data, const Comparator* ucomp,
+ uint32_t bucket_len, uint32_t user_key_len,
+ const Slice& target = Slice())
+ : file_data_(file_data),
+ ucomp_(ucomp),
+ bucket_len_(bucket_len),
+ user_key_len_(user_key_len),
+ target_(target) {}
+ bool operator()(const uint32_t first, const uint32_t second) const {
+ const char* first_bucket = (first == kInvalidIndex)
+ ? target_.data()
+ : &file_data_.data()[first * bucket_len_];
+ const char* second_bucket =
+ (second == kInvalidIndex) ? target_.data()
+ : &file_data_.data()[second * bucket_len_];
+ return ucomp_->Compare(Slice(first_bucket, user_key_len_),
+ Slice(second_bucket, user_key_len_)) < 0;
+ }
+
+ private:
+ const Slice file_data_;
+ const Comparator* ucomp_;
+ const uint32_t bucket_len_;
+ const uint32_t user_key_len_;
+ const Slice target_;
+ };
+
+ const BucketComparator bucket_comparator_;
+ void PrepareKVAtCurrIdx();
+ CuckooTableReader* reader_;
+ bool initialized_;
+ // Contains a map of keys to bucket_id sorted in key order.
+ std::vector<uint32_t> sorted_bucket_ids_;
+ // We assume that the number of items can be stored in uint32 (4 Billion).
+ uint32_t curr_key_idx_;
+ Slice curr_value_;
+ IterKey curr_key_;
+};
+
+CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
+ : bucket_comparator_(reader->file_data_, reader->ucomp_,
+ reader->bucket_length_, reader->user_key_length_),
+ reader_(reader),
+ initialized_(false),
+ curr_key_idx_(kInvalidIndex) {
+ sorted_bucket_ids_.clear();
+ curr_value_.clear();
+ curr_key_.Clear();
+}
+
+void CuckooTableIterator::InitIfNeeded() {
+ if (initialized_) {
+ return;
+ }
+ sorted_bucket_ids_.reserve(
+ static_cast<size_t>(reader_->GetTableProperties()->num_entries));
+ uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
+ assert(num_buckets < kInvalidIndex);
+ const char* bucket = reader_->file_data_.data();
+ for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+ if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
+ sorted_bucket_ids_.push_back(bucket_id);
+ }
+ bucket += reader_->bucket_length_;
+ }
+ assert(sorted_bucket_ids_.size() ==
+ reader_->GetTableProperties()->num_entries);
+ std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+ bucket_comparator_);
+ curr_key_idx_ = kInvalidIndex;
+ initialized_ = true;
+}
+
+void CuckooTableIterator::SeekToFirst() {
+ InitIfNeeded();
+ curr_key_idx_ = 0;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekToLast() {
+ InitIfNeeded();
+ curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Seek(const Slice& target) {
+ InitIfNeeded();
+ const BucketComparator seek_comparator(
+ reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
+ reader_->user_key_length_, ExtractUserKey(target));
+ auto seek_it =
+ std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+ kInvalidIndex, seek_comparator);
+ curr_key_idx_ =
+ static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
+ // Not supported
+ assert(false);
+}
+
+bool CuckooTableIterator::Valid() const {
+ return curr_key_idx_ < sorted_bucket_ids_.size();
+}
+
+void CuckooTableIterator::PrepareKVAtCurrIdx() {
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ uint32_t id = sorted_bucket_ids_[curr_key_idx_];
+ const char* offset =
+ reader_->file_data_.data() + id * reader_->bucket_length_;
+ if (reader_->is_last_level_) {
+ // Always return internal key.
+ curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0,
+ kTypeValue);
+ } else {
+ curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
+ }
+ curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
+}
+
+void CuckooTableIterator::Next() {
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ ++curr_key_idx_;
+ PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Prev() {
+ if (curr_key_idx_ == 0) {
+ curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
+ }
+ if (!Valid()) {
+ curr_value_.clear();
+ curr_key_.Clear();
+ return;
+ }
+ --curr_key_idx_;
+ PrepareKVAtCurrIdx();
+}
+
+Slice CuckooTableIterator::key() const {
+ assert(Valid());
+ return curr_key_.GetInternalKey();
+}
+
+Slice CuckooTableIterator::value() const {
+ assert(Valid());
+ return curr_value_;
+}
+
+InternalIterator* CuckooTableReader::NewIterator(
+ const ReadOptions& /*read_options*/,
+ const SliceTransform* /* prefix_extractor */, Arena* arena,
+ bool /*skip_filters*/, TableReaderCaller /*caller*/,
+ size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+ if (!status().ok()) {
+ return NewErrorInternalIterator<Slice>(
+ Status::Corruption("CuckooTableReader status is not okay."), arena);
+ }
+ CuckooTableIterator* iter;
+ if (arena == nullptr) {
+ iter = new CuckooTableIterator(this);
+ } else {
+ auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
+ iter = new (iter_mem) CuckooTableIterator(this);
+ }
+ return iter;
+}
+
+size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
+
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
new file mode 100644
index 000000000..f6c599ae8
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class TableReader;
+struct ImmutableOptions;
+
+class CuckooTableReader : public TableReader {
+ public:
+ CuckooTableReader(const ImmutableOptions& ioptions,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ uint64_t file_size, const Comparator* user_comparator,
+ uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+ uint64_t));
+ ~CuckooTableReader() {}
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override {
+ return table_props_;
+ }
+
+ Status status() const { return status_; }
+
+ Status Get(const ReadOptions& readOptions, const Slice& key,
+ GetContext* get_context, const SliceTransform* prefix_extractor,
+ bool skip_filters = false) override;
+
+ // Returns a new iterator over table contents
+ // compaction_readahead_size: its value will only be used if for_compaction =
+ // true
+ InternalIterator* NewIterator(const ReadOptions&,
+ const SliceTransform* prefix_extractor,
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0,
+ bool allow_unprepared_value = false) override;
+ void Prepare(const Slice& target) override;
+
+ // Report an approximation of how much memory has been used.
+ size_t ApproximateMemoryUsage() const override;
+
+ // Following methods are not implemented for Cuckoo Table Reader
+ uint64_t ApproximateOffsetOf(const Slice& /*key*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+ TableReaderCaller /*caller*/) override {
+ return 0;
+ }
+
+ void SetupForCompaction() override {}
+ // End of methods not implemented.
+
+ private:
+ friend class CuckooTableIterator;
+ void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id);
+ std::unique_ptr<RandomAccessFileReader> file_;
+ Slice file_data_;
+ bool is_last_level_;
+ bool identity_as_first_hash_;
+ bool use_module_hash_;
+ std::shared_ptr<const TableProperties> table_props_;
+ Status status_;
+ uint32_t num_hash_func_;
+ std::string unused_key_;
+ uint32_t key_length_;
+ uint32_t user_key_length_;
+ uint32_t value_length_;
+ uint32_t bucket_length_;
+ uint32_t cuckoo_block_size_;
+ uint32_t cuckoo_block_bytes_minus_one_;
+ uint64_t table_size_;
+ const Comparator* ucomp_;
+ uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+ uint64_t max_num_buckets);
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
new file mode 100644
index 000000000..d3d1490c6
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
@@ -0,0 +1,584 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/get_context.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_string(file_dir, "",
+ "Directory where the files will be created"
+ " for benchmark. Added for using tmpfs.");
+DEFINE_bool(enable_perf, false, "Run Benchmark Tests too.");
+DEFINE_bool(write, false,
+ "Should write new values to file in performance tests?");
+DEFINE_bool(identity_as_first_hash, true, "use identity as first hash");
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint32_t kNumHashFunc = 10;
+// Methods, variables related to Hash functions.
+std::unordered_map<std::string, std::vector<uint64_t> > hash_map;
+
+void AddHashLookups(const std::string& s, uint64_t bucket_id,
+ uint32_t num_hash_fun) {
+ std::vector<uint64_t> v;
+ for (uint32_t i = 0; i < num_hash_fun; i++) {
+ v.push_back(bucket_id + i);
+ }
+ hash_map[s] = v;
+}
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+ uint64_t /*max_num_buckets*/) {
+ return hash_map[s.ToString()][index];
+}
+} // namespace
+
+class CuckooReaderTest : public testing::Test {
+ public:
+ using testing::Test::SetUp;
+
+ CuckooReaderTest() {
+ options.allow_mmap_reads = true;
+ env = options.env;
+ file_options = FileOptions(options);
+ }
+
+ void SetUp(int num) {
+ num_items = num;
+ hash_map.clear();
+ keys.clear();
+ keys.resize(num_items);
+ user_keys.clear();
+ user_keys.resize(num_items);
+ values.clear();
+ values.resize(num_items);
+ }
+
+ std::string NumToStr(int64_t i) {
+ return std::string(reinterpret_cast<char*>(&i), sizeof(i));
+ }
+
+ void CreateCuckooFileAndCheckReader(
+ const Comparator* ucomp = BytewiseComparator()) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname,
+ file_options, &file_writer, nullptr));
+ CuckooTableBuilder builder(
+ file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false,
+ GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
+ builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
+ ASSERT_OK(builder.status());
+ ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+ }
+ ASSERT_OK(builder.Finish());
+ ASSERT_EQ(num_items, builder.NumEntries());
+ file_size = builder.FileSize();
+ ASSERT_OK(file_writer->Close());
+
+ // Check reader now.
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(
+ env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+ const ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ // Assume no merge/deletion
+ for (uint32_t i = 0; i < num_items; ++i) {
+ PinnableSlice value;
+ GetContext get_context(ucomp, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(user_keys[i]), &value,
+ nullptr, nullptr, nullptr, nullptr, true, nullptr,
+ nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr));
+ ASSERT_STREQ(values[i].c_str(), value.data());
+ }
+ }
+ void UpdateKeys(bool with_zero_seqno) {
+ for (uint32_t i = 0; i < num_items; i++) {
+ ParsedInternalKey ikey(user_keys[i], with_zero_seqno ? 0 : i + 1000,
+ kTypeValue);
+ keys[i].clear();
+ AppendInternalKey(&keys[i], ikey);
+ }
+ }
+
+ void CheckIterator(const Comparator* ucomp = BytewiseComparator()) {
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(
+ env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+ const ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ InternalIterator* it = reader.NewIterator(
+ ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(!it->Valid());
+ it->SeekToFirst();
+ int cnt = 0;
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ ++cnt;
+ it->Next();
+ }
+ ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+
+ it->SeekToLast();
+ cnt = static_cast<int>(num_items) - 1;
+ ASSERT_TRUE(it->Valid());
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ --cnt;
+ it->Prev();
+ }
+ ASSERT_EQ(cnt, -1);
+
+ cnt = static_cast<int>(num_items) / 2;
+ it->Seek(keys[cnt]);
+ while (it->Valid()) {
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+ ASSERT_TRUE(Slice(values[cnt]) == it->value());
+ ++cnt;
+ it->Next();
+ }
+ ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+ delete it;
+
+ Arena arena;
+ it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena,
+ /*skip_filters=*/false,
+ TableReaderCaller::kUncategorized);
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(!it->Valid());
+ it->Seek(keys[num_items / 2]);
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(keys[num_items / 2] == it->key());
+ ASSERT_TRUE(values[num_items / 2] == it->value());
+ ASSERT_OK(it->status());
+ it->~InternalIterator();
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> user_keys;
+ std::vector<std::string> values;
+ uint64_t num_items;
+ std::string fname;
+ uint64_t file_size;
+ Options options;
+ Env* env;
+ FileOptions file_options;
+};
+
+TEST_F(CuckooReaderTest, FileNotMmaped) {
+ options.allow_mmap_reads = false;
+ ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr);
+ ASSERT_TRUE(reader.status().IsInvalidArgument());
+ ASSERT_STREQ("File is not mmaped", reader.status().getState());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExists) {
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values.
+ AddHashLookups(user_keys[i], i, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+ // Test with collision. Make all hash values collide.
+ hash_map.clear();
+ for (uint32_t i = 0; i < num_items; i++) {
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ UpdateKeys(false);
+ CreateCuckooFileAndCheckReader();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) {
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i].resize(8);
+ memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values.
+ AddHashLookups(user_keys[i], i, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Test with collision. Make all hash values collide.
+ hash_map.clear();
+ for (uint32_t i = 0; i < num_items; i++) {
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ UpdateKeys(false);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, CheckIterator) {
+ SetUp(2 * kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values, in reverse order.
+ AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader();
+ CheckIterator();
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader();
+ CheckIterator();
+}
+
+TEST_F(CuckooReaderTest, CheckIteratorUint64) {
+ SetUp(2 * kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i].resize(8);
+ memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+ ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Give disjoint hash values, in reverse order.
+ AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
+ }
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ CheckIterator(test::Uint64Comparator());
+ // Last level file.
+ UpdateKeys(true);
+ CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+ CheckIterator(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyNotFound) {
+ // Add keys with colliding hash values.
+ SetUp(kNumHashFunc);
+ fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound");
+ for (uint64_t i = 0; i < num_items; i++) {
+ user_keys[i] = "key" + NumToStr(i);
+ ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+ AppendInternalKey(&keys[i], ikey);
+ values[i] = "value" + NumToStr(i);
+ // Make all hash values collide.
+ AddHashLookups(user_keys[i], 0, kNumHashFunc);
+ }
+ auto* ucmp = BytewiseComparator();
+ CreateCuckooFileAndCheckReader();
+
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(
+ env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+
+ const ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp,
+ GetSliceHash);
+ ASSERT_OK(reader.status());
+ // Search for a key with colliding hash values.
+ std::string not_found_user_key = "key" + NumToStr(num_items);
+ std::string not_found_key;
+ AddHashLookups(not_found_user_key, 0, kNumHashFunc);
+ ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue);
+ AppendInternalKey(&not_found_key, ikey);
+ PinnableSlice value;
+ GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
+ Slice(not_found_key), &value, nullptr, nullptr,
+ nullptr, nullptr, true, nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+ // Search for a key with an independent hash value.
+ std::string not_found_user_key2 = "key" + NumToStr(num_items + 1);
+ AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc);
+ ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue);
+ std::string not_found_key2;
+ AppendInternalKey(&not_found_key2, ikey2);
+ value.Reset();
+ GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(not_found_key2), &value,
+ nullptr, nullptr, nullptr, nullptr, true, nullptr,
+ nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+
+ // Test read when key is unused key.
+ std::string unused_key =
+ reader.GetTableProperties()->user_collected_properties.at(
+ CuckooTablePropertyNames::kEmptyKey);
+ // Add hash values that map to empty buckets.
+ AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc,
+ kNumHashFunc);
+ value.Reset();
+ GetContext get_context3(
+ ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key),
+ &value, nullptr, nullptr, nullptr, nullptr, true, nullptr, nullptr);
+ ASSERT_OK(
+ reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr));
+ ASSERT_TRUE(value.empty());
+ ASSERT_OK(reader.status());
+}
+
+// Performance tests
+namespace {
+void GetKeys(uint64_t num, std::vector<std::string>* keys) {
+ keys->clear();
+ IterKey k;
+ k.SetInternalKey("", 0, kTypeValue);
+ std::string internal_key_suffix = k.GetInternalKey().ToString();
+ ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size());
+ for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+ uint64_t value = 2 * key_idx;
+ std::string new_key(reinterpret_cast<char*>(&value), sizeof(value));
+ new_key += internal_key_suffix;
+ keys->push_back(new_key);
+ }
+}
+
+std::string GetFileName(uint64_t num) {
+ if (FLAGS_file_dir.empty()) {
+ FLAGS_file_dir = test::TmpDir();
+ }
+ return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") +
+ std::to_string(num / 1000000) + "Mkeys";
+}
+
+// Create last level file as we are interested in measuring performance of
+// last level file only.
+void WriteFile(const std::vector<std::string>& keys, const uint64_t num,
+ double hash_ratio) {
+ Options options;
+ options.allow_mmap_reads = true;
+ const auto& fs = options.env->GetFileSystem();
+ FileOptions file_options(options);
+ std::string fname = GetFileName(num);
+
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer,
+ nullptr));
+ CuckooTableBuilder builder(
+ file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5,
+ false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */,
+ kDefaultColumnFamilyName);
+ ASSERT_OK(builder.status());
+ for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+ // Value is just a part of key.
+ builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4));
+ ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+ ASSERT_OK(builder.status());
+ }
+ ASSERT_OK(builder.Finish());
+ ASSERT_EQ(num, builder.NumEntries());
+ ASSERT_OK(file_writer->Close());
+
+ uint64_t file_size;
+ ASSERT_OK(
+ fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr));
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options,
+ &file_reader, nullptr));
+
+ const ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+ test::Uint64Comparator(), nullptr);
+ ASSERT_OK(reader.status());
+ ReadOptions r_options;
+ PinnableSlice value;
+ // Assume only the fast path is triggered
+ GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(), &value, nullptr,
+ nullptr, nullptr, true, nullptr, nullptr);
+ for (uint64_t i = 0; i < num; ++i) {
+ value.Reset();
+ value.clear();
+ ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr));
+ ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4));
+ }
+}
+
+void ReadKeys(uint64_t num, uint32_t batch_size) {
+ Options options;
+ options.allow_mmap_reads = true;
+ Env* env = options.env;
+ const auto& fs = options.env->GetFileSystem();
+ FileOptions file_options(options);
+ std::string fname = GetFileName(num);
+
+ uint64_t file_size;
+ ASSERT_OK(
+ fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr));
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options,
+ &file_reader, nullptr));
+
+ const ImmutableOptions ioptions(options);
+ CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+ test::Uint64Comparator(), nullptr);
+ ASSERT_OK(reader.status());
+ const UserCollectedProperties user_props =
+ reader.GetTableProperties()->user_collected_properties;
+ const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
+ user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
+ const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+ user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
+ fprintf(stderr,
+ "With %" PRIu64
+ " items, utilization is %.2f%%, number of"
+ " hash functions: %u.\n",
+ num, num * 100.0 / (table_size), num_hash_fun);
+ ReadOptions r_options;
+
+ std::vector<uint64_t> keys;
+ keys.reserve(num);
+ for (uint64_t i = 0; i < num; ++i) {
+ keys.push_back(2 * i);
+ }
+ RandomShuffle(keys.begin(), keys.end());
+
+ PinnableSlice value;
+ // Assume only the fast path is triggered
+ GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, Slice(), &value, nullptr,
+ nullptr, nullptr, true, nullptr, nullptr);
+ uint64_t start_time = env->NowMicros();
+ if (batch_size > 0) {
+ for (uint64_t i = 0; i < num; i += batch_size) {
+ for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
+ reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16));
+ }
+ for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
+ reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16),
+ &get_context, nullptr);
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < num; i++) {
+ reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16),
+ &get_context, nullptr);
+ }
+ }
+ float time_per_op = (env->NowMicros() - start_time) * 1.0f / num;
+ fprintf(stderr,
+ "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
+ time_per_op, 1.0 / time_per_op, batch_size);
+}
+} // namespace.
+
+TEST_F(CuckooReaderTest, TestReadPerformance) {
+ if (!FLAGS_enable_perf) {
+ return;
+ }
+ double hash_ratio = 0.95;
+ // These numbers are chosen to have a hash utilization % close to
+ // 0.9, 0.75, 0.6 and 0.5 respectively.
+ // They all create 128 M buckets.
+ std::vector<uint64_t> nums = {120 * 1024 * 1024, 100 * 1024 * 1024,
+ 80 * 1024 * 1024, 70 * 1024 * 1024};
+#ifndef NDEBUG
+ fprintf(
+ stdout,
+ "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
+#endif
+ for (uint64_t num : nums) {
+ if (FLAGS_write ||
+ Env::Default()->FileExists(GetFileName(num)).IsNotFound()) {
+ std::vector<std::string> all_keys;
+ GetKeys(num, &all_keys);
+ WriteFile(all_keys, num, hash_ratio);
+ }
+ ReadKeys(num, 0);
+ ReadKeys(num, 10);
+ ReadKeys(num, 25);
+ ReadKeys(num, 50);
+ ReadKeys(num, 100);
+ fprintf(stderr, "\n");
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#endif // GFLAGS.
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE