| Field | Value | Date |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
| commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
| tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/rocksdb/table | |
| parent | Initial commit. (diff) | |
| download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz, ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip | |
Adding upstream version 14.2.21. (upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/table')
88 files changed, 28832 insertions, 0 deletions
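The first file added below is `AdaptiveTableFactory`, which selects a `TableReader` by inspecting each SST file's footer magic number (plain, block-based, or cuckoo) and delegates writes to a configurable factory. As a minimal usage sketch, not part of this commit, the snippet below shows how such a factory might be plugged into a database. It assumes the public `NewAdaptiveTableFactory()` declaration in `rocksdb/table.h` (which defaults all four factory arguments to nullptr, so the factory falls back to the built-in block-based, plain, and cuckoo factories) and uses a hypothetical database path.

```cpp
// Minimal sketch (assumptions noted above): open a DB whose reads dispatch
// per-file on the footer magic number while writes use the default
// block-based table format.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // No explicit write factory is passed, so new SST files are written with
  // the block-based factory; existing plain/cuckoo files remain readable.
  options.table_factory.reset(rocksdb::NewAdaptiveTableFactory());

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/adaptive_demo" /* hypothetical path */, &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  delete db;
  return 0;
}
```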
diff --git a/src/rocksdb/table/adaptive_table_factory.cc b/src/rocksdb/table/adaptive_table_factory.cc new file mode 100644 index 00000000..bbba3b91 --- /dev/null +++ b/src/rocksdb/table/adaptive_table_factory.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "table/adaptive_table_factory.h" + +#include "table/table_builder.h" +#include "table/format.h" +#include "port/port.h" + +namespace rocksdb { + +AdaptiveTableFactory::AdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory) + : table_factory_to_write_(table_factory_to_write), + block_based_table_factory_(block_based_table_factory), + plain_table_factory_(plain_table_factory), + cuckoo_table_factory_(cuckoo_table_factory) { + if (!plain_table_factory_) { + plain_table_factory_.reset(NewPlainTableFactory()); + } + if (!block_based_table_factory_) { + block_based_table_factory_.reset(NewBlockBasedTableFactory()); + } + if (!cuckoo_table_factory_) { + cuckoo_table_factory_.reset(NewCuckooTableFactory()); + } + if (!table_factory_to_write_) { + table_factory_to_write_ = block_based_table_factory_; + } +} + +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kCuckooTableMagicNumber; + +Status AdaptiveTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + Footer footer; + auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, + file_size, &footer); + if (!s.ok()) { + return s; + } + if (footer.table_magic_number() == kPlainTableMagicNumber || + footer.table_magic_number() == kLegacyPlainTableMagicNumber) { + return plain_table_factory_->NewTableReader( + table_reader_options, std::move(file), file_size, table); + } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || + footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { + return block_based_table_factory_->NewTableReader( + table_reader_options, std::move(file), file_size, table); + } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { + return cuckoo_table_factory_->NewTableReader( + table_reader_options, std::move(file), file_size, table); + } else { + return Status::NotSupported("Unidentified table format"); + } +} + +TableBuilder* AdaptiveTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + return table_factory_to_write_->NewTableBuilder(table_builder_options, + column_family_id, file); +} + +std::string AdaptiveTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + if (table_factory_to_write_) { + snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", + (table_factory_to_write_->Name() ? 
table_factory_to_write_->Name() + : ""), + table_factory_to_write_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (plain_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + plain_table_factory_->Name() ? plain_table_factory_->Name() : "", + plain_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (block_based_table_factory_) { + snprintf( + buffer, kBufferSize, " %s options:\n%s\n", + (block_based_table_factory_->Name() ? block_based_table_factory_->Name() + : ""), + block_based_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + if (cuckoo_table_factory_) { + snprintf(buffer, kBufferSize, " %s options:\n%s\n", + cuckoo_table_factory_->Name() ? cuckoo_table_factory_->Name() : "", + cuckoo_table_factory_->GetPrintableTableOptions().c_str()); + ret.append(buffer); + } + return ret; +} + +extern TableFactory* NewAdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory) { + return new AdaptiveTableFactory(table_factory_to_write, + block_based_table_factory, plain_table_factory, cuckoo_table_factory); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/adaptive_table_factory.h b/src/rocksdb/table/adaptive_table_factory.h new file mode 100644 index 00000000..5534c8b3 --- /dev/null +++ b/src/rocksdb/table/adaptive_table_factory.h @@ -0,0 +1,62 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +class AdaptiveTableFactory : public TableFactory { + public: + ~AdaptiveTableFactory() {} + + explicit AdaptiveTableFactory( + std::shared_ptr<TableFactory> table_factory_to_write, + std::shared_ptr<TableFactory> block_based_table_factory, + std::shared_ptr<TableFactory> plain_table_factory, + std::shared_ptr<TableFactory> cuckoo_table_factory); + + const char* Name() const override { return "AdaptiveTableFactory"; } + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + // Sanitizes the specified DB Options. 
+ Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::OK(); + } + + std::string GetPrintableTableOptions() const override; + + private: + std::shared_ptr<TableFactory> table_factory_to_write_; + std::shared_ptr<TableFactory> block_based_table_factory_; + std::shared_ptr<TableFactory> plain_table_factory_; + std::shared_ptr<TableFactory> cuckoo_table_factory_; +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/block.cc b/src/rocksdb/table/block.cc new file mode 100644 index 00000000..7c83ebb6 --- /dev/null +++ b/src/rocksdb/table/block.cc @@ -0,0 +1,960 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block.h" +#include <algorithm> +#include <string> +#include <unordered_map> +#include <vector> + +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "table/block_prefix_index.h" +#include "table/data_block_footer.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/logging.h" + +namespace rocksdb { + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + assert(limit - p >= 3); + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +// Helper routine: similar to DecodeEntry but does not have assertions. +// Instead, returns nullptr so that caller can detect and report failure. 
+struct CheckAndDecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) { + return nullptr; + } + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; + } +}; + +void DataBlockIter::Next() { + assert(Valid()); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::NextOrReport() { + assert(Valid()); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::Next() { + assert(Valid()); + ParseNextIndexKey(); +} + +void IndexBlockIter::Prev() { + assert(Valid()); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + do { + if (!ParseNextIndexKey()) { + break; + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); +} + +// Similar to IndexBlockIter::Prev but also caches the prev entries +void DataBlockIter::Prev() { + assert(Valid()); + + assert(prev_entries_idx_ == -1 || + static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* 
key_ptr = nullptr; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + key_pinned_ = true; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + key_pinned_ = false; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + key_.SetKey(current_key, false /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + + do { + if (!ParseNextDataKey<DecodeEntry>()) { + break; + } + Slice current_key = key(); + + if (key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1; +} + +void DataBlockIter::Seek(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= target + + while (true) { + if (!ParseNextDataKey<DecodeEntry>() || Compare(key_, seek_key) >= 0) { + return; + } + } +} + +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex, this function behaves identically as Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key_ with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
+bool DataBlockIter::SeekForGetImpl(const Slice& target) { + Slice user_key = ExtractUserKey(target); + uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); + uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key); + + if (entry == kCollision) { + // HashSeek not effective, falling back + Seek(target); + return true; + } + + if (entry == kNoEntry) { + // Even if we cannot find the user_key in this block, the result may + // exist in the next block. Consider this exmpale: + // + // Block N: [aab@100, ... , app@120] + // bounary key: axy@50 (we make minimal assumption about a boundary key) + // Block N+1: [axy@10, ... ] + // + // If seek_key = axy@60, the search will starts from Block N. + // Even if the user_key is not found in the hash map, the caller still + // have to conntinue searching the next block. + // + // In this case, we pretend the key is the the last restart interval. + // The while-loop below will search the last restart interval for the + // key. It will stop at the first key that is larger than the seek_key, + // or to the end of the block if no one is larger. + entry = static_cast<uint8_t>(num_restarts_ - 1); + } + + uint32_t restart_index = entry; + + // check if the key is in the restart_interval + assert(restart_index < num_restarts_); + SeekToRestartPoint(restart_index); + + const char* limit = nullptr; + if (restart_index_ + 1 < num_restarts_) { + limit = data_ + GetRestartPoint(restart_index_ + 1); + } else { + limit = data_ + restarts_; + } + + while (true) { + // Here we only linear seek the target key inside the restart interval. + // If a key does not exist inside a restart interval, we avoid + // further searching the block content accross restart interval boundary. + // + // TODO(fwu): check the left and write boundary of the restart interval + // to avoid linear seek a target key that is out of range. + if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) { + // we stop at the first potential matching user key. + break; + } + } + + if (current_ == restarts_) { + // Search reaches to the end of the block. There are three possibilites: + // 1) there is only one user_key match in the block (otherwise collsion). + // the matching user_key resides in the last restart interval, and it + // is the last key of the restart interval and of the block as well. + // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller. + // + // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, + // AND all existing user_keys in the restart interval are smaller than + // seek_user_key. + // + // 3) The seek_key is a false positive and happens to be hashed to the + // last restart interval, AND all existing user_keys in the restart + // interval are smaller than seek_user_key. + // + // The result may exist in the next block each case, so we return true. + return true; + } + + if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) { + // the key is not in this block and cannot be at the next block either. + return false; + } + + // Here we are conservative and only support a limited set of cases + ValueType value_type = ExtractValueType(key_.GetKey()); + if (value_type != ValueType::kTypeValue && + value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeSingleDeletion && + value_type != ValueType::kTypeBlobIndex) { + Seek(target); + return true; + } + + // Result found, and the iter is correctly set. 
+ return true; +} + +void IndexBlockIter::Seek(const Slice& target) { + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = false; + if (prefix_index_) { + ok = PrefixSeek(target, &index); + } else if (value_delta_encoded_) { + ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } else { + ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= target + + while (true) { + if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) { + return; + } + } +} + +void DataBlockIter::SeekForPrev(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= seek_key + + while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { + } + if (!Valid()) { + SeekToLast(); + } else { + while (Valid() && Compare(key_, seek_key) > 0) { + Prev(); + } + } +} + +void DataBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::SeekToFirstOrReport() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextIndexKey(); +} + +void DataBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void IndexBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +template <class TValue> +void BlockIter<TValue>::CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.Clear(); + value_.clear(); +} + +template <typename DecodeEntryFunc> +bool DataBlockIter::ParseNextDataKey(const char* limit) { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + if (!limit) { + limit = data_ + restarts_; // Restarts come right after data + } + + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } else { + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. 
+ key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. + assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(key_.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + if (key_pinned_) { + // TODO(tec): Investigate updating the seqno in the loaded block + // directly instead of doing a copy and update. + + // We cannot use the key address in the block directly because + // we have a global_seqno_ that will overwrite the encoded one. + key_.OwnKey(); + key_pinned_ = false; + } + + key_.UpdateInternalKey(global_seqno_, value_type); + } + + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + return true; + } +} + +bool IndexBlockIter::ParseNextIndexKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. + key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_) { + assert(value_length == 0); + DecodeCurrentValue(shared); + } + return true; +} + +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. 
+// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + assert(value_delta_encoded_); + const char* limit = data_ + restarts_; + if (shared == 0) { + uint64_t o, s; + const char* newp = GetVarint64Ptr(value_.data(), limit, &o); + assert(newp); + newp = GetVarint64Ptr(newp, limit, &s); + assert(newp); + decoded_value_ = BlockHandle(o, s); + value_ = Slice(value_.data(), newp - value_.data()); + } else { + uint64_t next_value_base = + decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; + int64_t delta; + const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); + decoded_value_ = + BlockHandle(next_value_base, decoded_value_.size() + delta); + value_ = Slice(value_.data(), newp - value_.data()); + } +} + +// Binary search in restart array to find the first restart point that +// is either the last restart point with a key less than target, +// which means the key of next restart point is larger than target, or +// the first restart point with a key = target +template <class TValue> +template <typename DecodeKeyFunc> +bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + int cmp = comp->Compare(mid_key, target); + if (cmp < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else if (cmp > 0) { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } else { + left = right = mid; + } + } + + *index = left; + return true; +} + +// Compare target key and the block key of the block of `block_index`. +// Return -1 if error. +int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { + uint32_t region_offset = GetRestartPoint(block_index); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return 1; // Return target is smaller + } + Slice block_key(key_ptr, non_shared); + return Compare(block_key, target); +} + +// Binary search in block_ids to find the first block +// with a key >= target +bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, + uint32_t* block_ids, uint32_t left, + uint32_t right, uint32_t* index) { + assert(left <= right); + uint32_t left_bound = left; + + while (left <= right) { + uint32_t mid = (right + left) / 2; + + int cmp = CompareBlockKey(block_ids[mid], target); + if (!status_.ok()) { + return false; + } + if (cmp < 0) { + // Key at "target" is larger than "mid". Therefore all + // blocks before or at "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "target" is <= "mid". Therefore all blocks + // after "mid" are uninteresting. + // If there is only one block left, we found it. 
+ if (left == right) break; + right = mid; + } + } + + if (left == right) { + // In one of the two following cases: + // (1) left is the first one of block_ids + // (2) there is a gap of blocks between block of `left` and `left-1`. + // we can further distinguish the case of key in the block or key not + // existing, by comparing the target key and the key of the previous + // block to the left of the block found. + if (block_ids[left] > 0 && + (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && + CompareBlockKey(block_ids[left] - 1, target) > 0) { + current_ = restarts_; + return false; + } + + *index = block_ids[left]; + return true; + } else { + assert(left > right); + // Mark iterator invalid + current_ = restarts_; + return false; + } +} + +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { + assert(prefix_index_); + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + uint32_t* block_ids = nullptr; + uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); + + if (num_blocks == 0) { + current_ = restarts_; + return false; + } else { + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); + } +} + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2 * sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. 
+ // TEST_SYNC_POINT("Block::~Block"); +} + +Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()), + restart_offset_(0), + num_restarts_(0), + global_seqno_(_global_seqno) { + TEST_SYNC_POINT("Block::Block:0"); + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + // Should only decode restart points for uncompressed blocks + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast<uint32_t>(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast<uint16_t>(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker + } + } + if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + read_amp_bitmap_.reset(new BlockReadAmpBitmap( + restart_offset_, read_amp_bytes_per_bit, statistics)); + } +} + +template <> +DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool /*total_order_seek*/, + bool /*key_includes_seq*/, + bool /*value_is_full*/, + bool block_contents_pinned, + BlockPrefixIndex* /*prefix_index*/) { + DataBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new DataBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + ret_iter->Initialize( + cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + read_amp_bitmap_.get(), block_contents_pinned, + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + if (read_amp_bitmap_) { + if (read_amp_bitmap_->GetStatistics() != stats) { + // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ + read_amp_bitmap_->SetStatistics(stats); + } + } + } + + return ret_iter; +} + +template <> +IndexBlockIter* Block::NewIterator(const Comparator* cmp, + const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { + IndexBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new IndexBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + BlockPrefixIndex* prefix_index_ptr = + total_order_seek ? 
nullptr : prefix_index; + ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, + prefix_index_ptr, key_includes_seq, value_is_full, + block_contents_pinned, + nullptr /* data_block_hash_index */); + } + + return ret_iter; +} + +size_t Block::ApproximateMemoryUsage() const { + size_t usage = usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + if (read_amp_bitmap_) { + usage += read_amp_bitmap_->ApproximateMemoryUsage(); + } + return usage; +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/block.h b/src/rocksdb/table/block.h new file mode 100644 index 00000000..737874ab --- /dev/null +++ b/src/rocksdb/table/block.h @@ -0,0 +1,568 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <vector> +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "format.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_prefix_index.h" +#include "table/data_block_hash_index.h" +#include "table/internal_iterator.h" +#include "util/random.h" +#include "util/sync_point.h" + +namespace rocksdb { + +struct BlockContents; +class Comparator; +template <class TValue> +class BlockIter; +class DataBlockIter; +class IndexBlockIter; +class BlockPrefixIndex; + +// BlockReadAmpBitmap is a bitmap that map the rocksdb::Block data bytes to +// a bitmap with ratio bytes_per_bit. Whenever we access a range of bytes in +// the Block we update the bitmap and increment READ_AMP_ESTIMATE_USEFUL_BYTES. 
+class BlockReadAmpBitmap { + public: + explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, + Statistics* statistics) + : bitmap_(nullptr), + bytes_per_bit_pow_(0), + statistics_(statistics), + rnd_(Random::GetTLSInstance()->Uniform( + static_cast<int>(bytes_per_bit))) { + TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); + assert(block_size > 0 && bytes_per_bit > 0); + + // convert bytes_per_bit to be a power of 2 + while (bytes_per_bit >>= 1) { + bytes_per_bit_pow_++; + } + + // num_bits_needed = ceil(block_size / bytes_per_bit) + size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; + assert(num_bits_needed > 0); + + // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) + size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; + + // Create bitmap and set all the bits to 0 + bitmap_ = new std::atomic<uint32_t>[bitmap_size](); + + RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); + } + + ~BlockReadAmpBitmap() { delete[] bitmap_; } + + void Mark(uint32_t start_offset, uint32_t end_offset) { + assert(end_offset >= start_offset); + // Index of first bit in mask + uint32_t start_bit = + (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> + bytes_per_bit_pow_; + // Index of last bit in mask + 1 + uint32_t exclusive_end_bit = + (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; + if (start_bit >= exclusive_end_bit) { + return; + } + assert(exclusive_end_bit > 0); + + if (GetAndSet(start_bit) == 0) { + uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) + << bytes_per_bit_pow_; + RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, + new_useful_bytes); + } + } + + Statistics* GetStatistics() { + return statistics_.load(std::memory_order_relaxed); + } + + void SetStatistics(Statistics* stats) { statistics_.store(stats); } + + uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } + + size_t ApproximateMemoryUsage() const { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size((void*)this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return sizeof(*this); + } + + private: + // Get the current value of bit at `bit_idx` and set it to 1 + inline bool GetAndSet(uint32_t bit_idx) { + const uint32_t byte_idx = bit_idx / kBitsPerEntry; + const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); + + return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & + bit_mask; + } + + const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes + const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits + + // Bitmap used to record the bytes that we read, use atomic to protect + // against multiple threads updating the same bit + std::atomic<uint32_t>* bitmap_; + // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize + // muliplication and division + uint8_t bytes_per_bit_pow_; + // Pointer to DB Statistics object, Since this bitmap may outlive the DB + // this pointer maybe invalid, but the DB will update it to a valid pointer + // by using SetStatistics() before calling Mark() + std::atomic<Statistics*> statistics_; + uint32_t rnd_; +}; + +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit = 0, + Statistics* statistics = nullptr); + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + // The additional memory space taken by the block data. 
+ size_t usable_size() const { return contents_.usable_size(); } + uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + + BlockBasedTableOptions::DataBlockIndexType IndexType() const; + + // If comparator is InternalKeyComparator, user_comparator is its user + // comparator; they are equal otherwise. + // + // If iter is null, return new Iterator + // If iter is not null, update this one and return it as Iterator* + // + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // NewIterator<DataBlockIter> + // Same as above but also updates read_amp_bitmap_ if it is not nullptr. + // + // NewIterator<IndexBlockIter> + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + template <typename TBlockIter> + TBlockIter* NewIterator( + const Comparator* comparator, const Comparator* user_comparator, + TBlockIter* iter = nullptr, Statistics* stats = nullptr, + bool total_order_seek = true, bool key_includes_seq = true, + bool value_is_full = true, bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const; + + SequenceNumber global_seqno() const { return global_seqno_; } + + private: + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() + uint32_t restart_offset_; // Offset in data_ of restart array + uint32_t num_restarts_; + std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_; + // All keys in the block will have seqno = global_seqno_, regardless of + // the encoded value (kDisableGlobalSequenceNumber means disabled) + const SequenceNumber global_seqno_; + + DataBlockHashIndex data_block_hash_index_; + + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; +}; + +template <class TValue> +class BlockIter : public InternalIteratorBase<TValue> { + public: + void InitializeBase(const Comparator* comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, bool block_contents_pinned) { + assert(data_ == nullptr); // Ensure it is called only once + assert(num_restarts > 0); // Ensure the param is valid + + comparator_ = comparator; + data_ = data; + restarts_ = restarts; + num_restarts_ = num_restarts; + current_ = restarts_; + restart_index_ = num_restarts_; + global_seqno_ = global_seqno; + block_contents_pinned_ = block_contents_pinned; + } + + // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do + // nothing. Calls cleanup functions. 
+ void InvalidateBase(Status s) { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + + data_ = nullptr; + current_ = restarts_; + status_ = s; + + // Call cleanup callbacks. + Cleanable::Reset(); + } + + virtual bool Valid() const override { return current_ < restarts_; } + virtual Status status() const override { return status_; } + virtual Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + +#ifndef NDEBUG + virtual ~BlockIter() { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + } + virtual void SetPinnedItersMgr( + PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + virtual bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } + + virtual bool IsValuePinned() const override { return block_contents_pinned_; } + + size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } + + uint32_t ValueOffset() const { + return static_cast<uint32_t>(value_.data() - data_); + } + + protected: + // Note: The type could be changed to InternalKeyComparator but we see a weird + // performance drop by that. + const Comparator* comparator_; + const char* data_; // underlying block contents + uint32_t num_restarts_; // Number of uint32_t entries in restart array + + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + IterKey key_; + Slice value_; + Status status_; + bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. PinnableSlice, the pointer to the bytes will still be valid. + bool block_contents_pinned_; + SequenceNumber global_seqno_; + + public: + // Return the offset in data_ just past the end of the current entry. 
+ inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast<uint32_t>((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + void CorruptionError(); + + template <typename DecodeKeyFunc> + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); +}; + +class DataBlockIter final : public BlockIter<Slice> { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, + const char* data, uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) + : DataBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); + } + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { + InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + block_contents_pinned); + user_comparator_ = user_comparator; + key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + virtual Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + virtual void Seek(const Slice& target) override; + + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + + virtual void SeekForPrev(const Slice& target) override; + + virtual void Prev() override; + + virtual void Next() override; + + // Try to advance to the next entry in the block. If there is data corruption + // or error, report it to the caller instead of aborting the process. May + // incur higher CPU overhead because we need to perform check on every entry. + void NextOrReport(); + + virtual void SeekToFirst() override; + + // Try to seek to the first entry in the block. If there is data corruption + // or error, report it to caller instead of aborting the process. May incur + // higher CPU overhead because we need to perform check on every entry. + void SeekToFirstOrReport(); + + virtual void SeekToLast() override; + + void Invalidate(Status s) { + InvalidateBase(s); + // Clear prev entries cache. 
+ prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + private: + // read-amp bitmap + BlockReadAmpBitmap* read_amp_bitmap_; + // last `current_` value we report to read-amp bitmp + mutable uint32_t last_bitmap_offset_; + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector<CachedPrevEntry> prev_entries_; + int32_t prev_entries_idx_ = -1; + + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + template <typename DecodeEntryFunc> + inline bool ParseNextDataKey(const char* limit = nullptr); + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetInternalKey(), b); + } + + bool SeekForGetImpl(const Slice& target); +}; + +class IndexBlockIter final : public BlockIter<BlockHandle> { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + virtual Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + IndexBlockIter(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) + : IndexBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full, nullptr /* data_block_hash_index */); + } + + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned, + DataBlockHashIndex* /*data_block_hash_index*/) { + InitializeBase(key_includes_seq ? 
comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); + key_includes_seq_ = key_includes_seq; + key_.SetIsUserKey(!key_includes_seq_); + prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + } + + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } + } + + virtual void Seek(const Slice& target) override; + + virtual void SeekForPrev(const Slice&) override { + assert(false); + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::InvalidArgument( + "RocksDB internal error: should never call SeekForPrev() on index " + "blocks"); + key_.Clear(); + value_.clear(); + } + + virtual void Prev() override; + + virtual void Next() override; + + virtual void SeekToFirst() override; + + virtual void SeekToLast() override; + + void Invalidate(Status s) { InvalidateBase(s); } + + private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. + BlockHandle decoded_value_; + + bool PrefixSeek(const Slice& target, uint32_t* index); + bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetKey(), b); + } + + inline bool ParseNextIndexKey(); + + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_filter_block.cc b/src/rocksdb/table/block_based_filter_block.cc new file mode 100644 index 00000000..81087b24 --- /dev/null +++ b/src/rocksdb/table/block_based_filter_block.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "table/block_based_filter_block.h" +#include <algorithm> + +#include "db/dbformat.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace rocksdb { + +namespace { + +void AppendItem(std::string* props, const std::string& key, + const std::string& value) { + char cspace = ' '; + std::string value_str(""); + size_t i = 0; + const size_t dataLength = 64; + const size_t tabLength = 2; + const size_t offLength = 16; + + value_str.append(&value[i], std::min(size_t(dataLength), value.size())); + i += dataLength; + while (i < value.size()) { + value_str.append("\n"); + value_str.append(offLength, cspace); + value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i)); + i += dataLength; + } + + std::string result(""); + if (key.size() < (offLength - tabLength)) + result.append(size_t((offLength - tabLength)) - key.size(), cspace); + result.append(key); + + props->append(result + ": " + value_str + "\n"); +} + +template <class TKey> +void AppendItem(std::string* props, const TKey& key, const std::string& value) { + std::string key_str = rocksdb::ToString(key); + AppendItem(props, key_str, value); +} +} // namespace + +// See doc/table_format.txt for an explanation of the filter block format. + +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + prev_prefix_start_(0), + prev_prefix_size_(0), + num_added_(0) { + assert(policy_); +} + +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } + + if (whole_key_filtering_) { + AddKey(key); + } +} + +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + num_added_++; + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); +} + +// Add prefix to filter if needed +inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { + // get slice for most recently added entry + Slice prev; + if (prev_prefix_size_ > 0) { + prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_); + } + + Slice prefix = prefix_extractor_->Transform(key); + // insert prefix only when it's different from the previous prefix. 
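+  // For example, with a fixed-length(3) prefix extractor the keys "app1",
+  // "app2" and "app3" all transform to "app", so only the first of them
+  // appends a prefix entry here.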
+ if (prev.size() == 0 || prefix != prev) { + prev_prefix_start_ = entries_.size(); + prev_prefix_size_ = prefix.size(); + AddKey(prefix); + } +} + +Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = static_cast<uint32_t>(result_.size()); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void BlockBasedFilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i + 1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. + filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries), + &result_); + + tmp_entries_.clear(); + entries_.clear(); + start_.clear(); + prev_prefix_start_ = 0; + prev_prefix_size_ = 0; +} + +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, bool _whole_key_filtering, + BlockContents&& contents, Statistics* stats) + : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), + policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + data_(nullptr), + offset_(nullptr), + num_(0), + base_lg_(0), + contents_(std::move(contents)) { + assert(policy_); + size_t n = contents_.data.size(); + if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array + base_lg_ = contents_.data[n - 1]; + uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); + if (last_word > n - 5) return; + data_ = contents_.data.data(); + offset_ = data_ + last_word; + num_ = (n - 5 - last_word) / 4; +} + +bool BlockBasedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { + assert(block_offset != kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key, block_offset); +} + +bool BlockBasedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { + assert(block_offset != kNotValid); + return MayMatch(prefix, block_offset); +} + +bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, + uint64_t block_offset) { + uint64_t index = block_offset >> base_lg_; + if (index < num_) { + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { + Slice filter = Slice(data_ + start, limit - start); + bool const may_match = 
policy_->KeyMayMatch(entry, filter); + if (may_match) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } else if (start == limit) { + // Empty filters do not match any entries + return false; + } + } + return true; // Errors are treated as potential matches +} + +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { + return num_ * 4 + 5 + (offset_ - data_); +} + +std::string BlockBasedFilterBlockReader::ToString() const { + std::string result; + result.reserve(1024); + + std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); + AppendItem(&result, s_fb, rocksdb::ToString(num_)); + AppendItem(&result, s_bo, s_hd); + + for (size_t index = 0; index < num_; index++) { + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + + if (start != limit) { + result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n"); + Slice filter = Slice(data_ + start, limit - start); + AppendItem(&result, start, filter.ToString(true)); + } + } + return result; +} +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_filter_block.h b/src/rocksdb/table/block_based_filter_block.h new file mode 100644 index 00000000..d1ff5854 --- /dev/null +++ b/src/rocksdb/table/block_based_filter_block.h @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <memory> +#include <string> +#include <vector> +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. 
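+  // ("these" refers to policy_ and prefix_extractor_ below; both are raw
+  // pointers owned by the table / column family options and may already be
+  // destroyed by then.)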
+ const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + size_t prev_prefix_start_; // the position of the last appended prefix + // to "entries_". + size_t prev_prefix_size_; // the length of the last appended prefix to + // "entries_". + std::string entries_; // Flattened entry contents + std::vector<size_t> start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument + std::vector<uint32_t> filter_offsets_; + size_t num_added_; // Number of keys added + + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); + void operator=(const BlockBasedFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. + BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + bool whole_key_filtering, + BlockContents&& contents, Statistics* statistics); + virtual bool IsBlockBased() override { return true; } + + virtual bool KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + virtual bool PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + virtual size_t ApproximateMemoryUsage() const override; + + // convert this object to a human readable form + std::string ToString() const override; + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + BlockContents contents_; + + bool MayMatch(const Slice& entry, uint64_t block_offset); + + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); + void operator=(const BlockBasedFilterBlockReader&); +}; +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_filter_block_test.cc b/src/rocksdb/table/block_based_filter_block_test.cc new file mode 100644 index 00000000..6b352b2f --- /dev/null +++ b/src/rocksdb/table/block_based_filter_block_test.cc @@ -0,0 +1,248 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
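+//
+// The block offsets used below assume kFilterBase = 2KB (kFilterBaseLg = 11,
+// as defined in block_based_filter_block.cc): offsets 0 and 2000 both map to
+// filter 0 (0 >> 11 == 2000 >> 11 == 0), 3100 maps to filter 1, 4100 to the
+// empty filter 2, and 9000 to filter 4.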
+ +#include "table/block_based_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest : public testing::Test { + public: + TestHashFilter policy_; + BlockBasedTableOptions table_options_; + + FilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST_F(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + BlockContents block(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + BlockBasedFilterBlockReader reader(nullptr, table_options_, true, + std::move(block), nullptr); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); +} + +TEST_F(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + BlockContents block(builder.Finish()); + BlockBasedFilterBlockReader reader(nullptr, table_options_, true, + std::move(block), nullptr); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); +} + +TEST_F(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + BlockContents block(builder.Finish()); + BlockBasedFilterBlockReader reader(nullptr, table_options_, true, + std::move(block), nullptr); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); + ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); + ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); + 
ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest : public testing::Test { + public: + BlockBasedTableOptions table_options_; + + BlockBasedFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); + } + + ~BlockBasedFilterBlockTest() override {} +}; + +TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + BlockContents block(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, true, std::move(block), nullptr); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + BlockContents block(builder->Finish()); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, true, std::move(block), nullptr); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); + ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + BlockContents block(builder->Finish()); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, true, std::move(block), nullptr); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 2000)); + ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); + 
ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); + ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based_table_builder.cc b/src/rocksdb/table/block_based_table_builder.cc new file mode 100644 index 00000000..479311f5 --- /dev/null +++ b/src/rocksdb/table/block_based_table_builder.cc @@ -0,0 +1,1188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based_table_builder.h" + +#include <assert.h> +#include <stdio.h> + +#include <list> +#include <map> +#include <memory> +#include <string> +#include <unordered_map> +#include <utility> + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/table.h" + +#include "table/block.h" +#include "table/block_based_filter_block.h" +#include "table/block_based_table_factory.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/full_filter_block.h" +#include "table/table_builder.h" + +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/memory_allocator.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +#include "table/index_builder.h" +#include "table/partitioned_filter_block.h" + +namespace rocksdb { + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTableOptions::IndexType IndexType; + +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + +// Create a filter block builder based on its type. 
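+// For partitioned filters the partition cut size is derived from
+// metadata_block_size and block_size_deviation; e.g. with
+// metadata_block_size = 4096 and block_size_deviation = 10 the formula below
+// gives ceil(4096 * (100 - 10) / 100) = 3687 bytes per filter partition.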
+FilterBlockBuilder* CreateFilterBlockBuilder( + const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, + const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, + PartitionedIndexBuilder* const p_index_builder) { + if (table_opt.filter_policy == nullptr) return nullptr; + + FilterBitsBuilder* filter_bits_builder = + table_opt.filter_policy->GetFilterBitsBuilder(); + if (filter_bits_builder == nullptr) { + return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(), + table_opt); + } else { + if (table_opt.partition_filters) { + assert(p_index_builder != nullptr); + // Since after partition cut request from filter builder it takes time + // until index builder actully cuts the partition, we take the lower bound + // as partition size. + assert(table_opt.block_size_deviation <= 100); + auto partition_size = + static_cast<uint32_t>(((table_opt.metadata_block_size * + (100 - table_opt.block_size_deviation)) + + 99) / + 100); + partition_size = std::max(partition_size, static_cast<uint32_t>(1)); + return new PartitionedFilterBlockBuilder( + mopt.prefix_extractor.get(), table_opt.whole_key_filtering, + filter_bits_builder, table_opt.index_block_restart_interval, + use_delta_encoding_for_index_values, p_index_builder, partition_size); + } else { + return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), + table_opt.whole_key_filtering, + filter_bits_builder); + } + } +} + +bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { + // Check to see if compressed less than 12.5% + return compressed_size < raw_size - (raw_size / 8u); +} + +bool CompressBlockInternal(const Slice& raw, + const CompressionInfo& compression_info, + uint32_t format_version, + std::string* compressed_output) { + // Will return compressed block contents if (1) the compression method is + // supported in this platform and (2) the compression rate is "good enough". 
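+  // ("Good enough" is decided by GoodCompressionRatio() above: the output
+  // must be at least 1/8 smaller than the input, so e.g. a 4096-byte block
+  // only counts as compressed if the result is 3583 bytes or fewer.)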
+ switch (compression_info.type()) { + case kSnappyCompression: + return Snappy_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + case kZlibCompression: + return Zlib_Compress( + compression_info, + GetCompressFormatForVersion(kZlibCompression, format_version), + raw.data(), raw.size(), compressed_output); + case kBZip2Compression: + return BZip2_Compress( + compression_info, + GetCompressFormatForVersion(kBZip2Compression, format_version), + raw.data(), raw.size(), compressed_output); + case kLZ4Compression: + return LZ4_Compress( + compression_info, + GetCompressFormatForVersion(kLZ4Compression, format_version), + raw.data(), raw.size(), compressed_output); + case kLZ4HCCompression: + return LZ4HC_Compress( + compression_info, + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + raw.data(), raw.size(), compressed_output); + case kXpressCompression: + return XPRESS_Compress(raw.data(), raw.size(), compressed_output); + case kZSTD: + case kZSTDNotFinalCompression: + return ZSTD_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + default: + // Do not recognize this compression type + return false; + } +} + +} // namespace + +// format_version is the block format as defined in include/rocksdb/table.h +Slice CompressBlock(const Slice& raw, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow) { + *type = info.type(); + + if (info.type() == kNoCompression && !info.SampleForCompression()) { + return raw; + } + + // If requested, we sample one in every N block with a + // fast and slow compression algorithm and report the stats. + // The users can use these stats to decide if it is worthwhile + // enabling compression and they also get a hint about which + // compression algorithm wil be beneficial. + if (do_sample && info.SampleForCompression() && + Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) && + sampled_output_fast && sampled_output_slow) { + // Sampling with a fast compression algorithm + if (LZ4_Supported() || Snappy_Supported()) { + CompressionType c = + LZ4_Supported() ? kLZ4Compression : kSnappyCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + + CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast); + } + + // Sampling with a slow but high-compression algorithm + if (ZSTD_Supported() || Zlib_Supported()) { + CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow); + } + } + + // Actually compress the data + if (*type != kNoCompression) { + if (CompressBlockInternal(raw, info, format_version, compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + } + + // Compression method is not supported, or not good + // compression ratio, so just fall back to uncompressed form. + *type = kNoCompression; + return raw; +} + +// kBlockBasedTableMagicNumber was picked by running +// echo rocksdb.table.block_based | sha1sum +// and taking the leading 64 bits. 
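+// (i.e. the first 16 hex digits of the digest, which yield the constant
+// 0x88e241b785f4cff7 defined below)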
+// Please note that kBlockBasedTableMagicNumber may also be accessed by other +// .cc files +// for that reason we declare it extern in the header but to get the space +// allocated +// it must be not extern in one place. +const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull; +// We also support reading and writing legacy block based table format (for +// backwards compatibility) +const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull; + +// A collector that collects properties of interest to block-based table. +// For now this class looks heavy-weight since we only write one additional +// property. +// But in the foreseeable future, we will add more and more properties that are +// specific to block-based table. +class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector + : public IntTblPropCollector { + public: + explicit BlockBasedTablePropertiesCollector( + BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering, + bool prefix_filtering) + : index_type_(index_type), + whole_key_filtering_(whole_key_filtering), + prefix_filtering_(prefix_filtering) {} + + Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. + return Status::OK(); + } + + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + Status Finish(UserCollectedProperties* properties) override { + std::string val; + PutFixed32(&val, static_cast<uint32_t>(index_type_)); + properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); + properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering, + whole_key_filtering_ ? kPropTrue : kPropFalse}); + properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering, + prefix_filtering_ ? kPropTrue : kPropFalse}); + return Status::OK(); + } + + // The name of the properties collector can be used for debugging purpose. + const char* Name() const override { + return "BlockBasedTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; + bool whole_key_filtering_; + bool prefix_filtering_; +}; + +struct BlockBasedTableBuilder::Rep { + const ImmutableCFOptions ioptions; + const MutableCFOptions moptions; + const BlockBasedTableOptions table_options; + const InternalKeyComparator& internal_comparator; + WritableFileWriter* file; + uint64_t offset = 0; + Status status; + size_t alignment; + BlockBuilder data_block; + // Buffers uncompressed data blocks and keys to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data + // blocks as it's redundant, but it's easier to implement for now. 
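+  // Each element pairs one uncompressed data block with the keys it holds,
+  // in file order; EnterUnbuffered() later replays the keys and writes the
+  // buffered blocks out once the dictionary has been trained.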
+ std::vector<std::pair<std::string, std::vector<std::string>>> + data_block_and_keys_buffers; + BlockBuilder range_del_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr<IndexBuilder> index_builder; + PartitionedIndexBuilder* p_index_builder_ = nullptr; + + std::string last_key; + CompressionType compression_type; + uint64_t sample_for_compression; + CompressionOptions compression_opts; + std::unique_ptr<CompressionDict> compression_dict; + CompressionContext compression_ctx; + std::unique_ptr<UncompressionContext> verify_ctx; + std::unique_ptr<UncompressionDict> verify_dict; + + size_t data_begin_offset = 0; + + TableProperties props; + + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. + enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + + const bool use_delta_encoding_for_index_values; + std::unique_ptr<FilterBlockBuilder> filter_builder; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr<FlushBlockPolicy> flush_block_policy; + uint32_t column_family_id; + const std::string& column_family_name; + uint64_t creation_time = 0; + uint64_t oldest_key_time = 0; + const uint64_t target_file_size; + + std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors; + + Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const BlockBasedTableOptions& table_opt, + const InternalKeyComparator& icomparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t _column_family_id, WritableFileWriter* f, + const CompressionType _compression_type, + const uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, const bool skip_filters, + const std::string& _column_family_name, const uint64_t _creation_time, + const uint64_t _oldest_key_time, const uint64_t _target_file_size) + : ioptions(_ioptions), + moptions(_moptions), + table_options(table_opt), + internal_comparator(icomparator), + file(f), + alignment(table_options.block_align + ? std::min(table_options.block_size, kDefaultPageSize) + : 0), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? 
BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(_moptions.prefix_extractor.get()), + compression_type(_compression_type), + sample_for_compression(_sample_for_compression), + compression_opts(_compression_opts), + compression_dict(), + compression_ctx(_compression_type), + verify_dict(), + state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + compressed_cache_key_prefix_size(0), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)), + column_family_id(_column_family_id), + column_family_name(_column_family_name), + creation_time(_creation_time), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size) { + if (table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( + &internal_comparator, use_delta_encoding_for_index_values, + table_options); + index_builder.reset(p_index_builder_); + } else { + index_builder.reset(IndexBuilder::CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); + } + if (skip_filters) { + filter_builder = nullptr; + } else { + filter_builder.reset(CreateFilterBlockBuilder( + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); + } + + for (auto& collector_factories : *int_tbl_prop_collector_factories) { + table_properties_collectors.emplace_back( + collector_factories->CreateIntTblPropCollector(column_family_id)); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector( + table_options.index_type, table_options.whole_key_filtering, + _moptions.prefix_extractor != nullptr)); + if (table_options.verify_compression) { + verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), + compression_type)); + } + } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + ~Rep() {} +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const uint64_t creation_time, + const uint64_t oldest_key_time, const uint64_t target_file_size) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + ROCKS_LOG_WARN( + ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep( + ioptions, moptions, sanitized_table_options, internal_comparator, + int_tbl_prop_collector_factories, column_family_id, 
file, + compression_type, sample_for_compression, compression_opts, skip_filters, + column_family_name, creation_time, oldest_key_time, target_file_size); + + if (rep_->filter_builder != nullptr) { + rep_->filter_builder->StartBlock(0); + } + if (table_options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + table_options.block_cache_compressed.get(), file->writable_file(), + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + ValueType value_type = ExtractValueType(key); + if (IsValueType(value_type)) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } +#endif // NDEBUG + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + if (r->state == Rep::State::kBuffered && + r->data_begin_offset > r->target_file_size) { + EnterUnbuffered(); + } + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + if (ok() && r->state == Rep::State::kUnbuffered) { + r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + } + } + + // Note: PartitionedFilterBlockBuilder requires key being added to filter + // builder after being added to index builder. + if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { + r->filter_builder->Add(ExtractUserKey(key)); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + if (r->state == Rep::State::kBuffered) { + // Buffer keys to be replayed during `Finish()` once compression + // dictionary has been finalized. 
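+      // (This branch is only taken while state == kBuffered, i.e. when
+      // compression_opts.max_dict_bytes > 0 and the dictionary has not been
+      // trained yet; see EnterUnbuffered().)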
+ if (r->data_block_and_keys_buffers.empty() || should_flush) { + r->data_block_and_keys_buffers.emplace_back(); + } + r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + } else { + r->index_builder->OnKeyAdded(key); + } + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + + } else if (value_type == kTypeRangeDeletion) { + r->range_del_block.Add(key, value); + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + } else { + assert(false); + } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle, + bool is_data_block) { + WriteBlock(block->Finish(), handle, is_data_block); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, + BlockHandle* handle, + bool is_data_block) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + + auto type = r->compression_type; + uint64_t sample_for_compression = r->sample_for_compression; + Slice block_contents; + bool abort_compression = false; + + StopWatchNano timer( + r->ioptions.env, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + + if (r->state == Rep::State::kBuffered) { + assert(is_data_block); + assert(!r->data_block_and_keys_buffers.empty()); + r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + return; + } + + if (raw_block_contents.size() < kCompressionSizeLimit) { + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); + } + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, r->compression_ctx, + *compression_dict, type, + sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + block_contents = CompressBlock( + raw_block_contents, compression_info, &type, + r->table_options.format_version, is_data_block /* do_sample */, + &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, raw_block_contents.size(), + sampled_output_fast.size(), sampled_output_slow.size()); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. 
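+    // (Verification is opted into via BlockBasedTableOptions::verify_compression;
+    // on a mismatch compression is abandoned for this block and the builder's
+    // status is set to Corruption.)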
+ if (type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); + BlockContents contents; + UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + r->compression_type); + Status stat = UncompressBlockContentsForCompressionType( + uncompression_info, block_contents.data(), block_contents.size(), + &contents, r->table_options.format_version, r->ioptions); + + if (stat.ok()) { + bool compressed_ok = contents.data.compare(raw_block_contents) == 0; + if (!compressed_ok) { + // The result of the compression was invalid. abort. + abort_compression = true; + ROCKS_LOG_ERROR(r->ioptions.info_log, + "Decompressed block did not match raw block"); + r->status = + Status::Corruption("Decompressed block did not match raw block"); + } + } else { + // Decompression reported an error. abort. + r->status = Status::Corruption("Could not decompress"); + abort_compression = true; + } + } + } else { + // Block is too big to be compressed. + abort_compression = true; + } + + // Abort compression if the block is too big, or did not pass + // verification. + if (abort_compression) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + type = kNoCompression; + block_contents = raw_block_contents; + } else if (type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { + RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + raw_block_contents.size()); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + } else if (type != r->compression_type) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + } + + WriteRawBlock(block_contents, type, handle, is_data_block); + r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle, + bool is_data_block) { + Rep* r = rep_; + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + assert(r->status.ok()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + char* trailer_without_type = trailer + 1; + switch (r->table_options.checksum) { + case kNoChecksum: + EncodeFixed32(trailer_without_type, 0); + break; + case kCRC32c: { + auto crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); + break; + } + case kxxHash: { + void* xxh = XXH32_init(0); + XXH32_update(xxh, block_contents.data(), + static_cast<uint32_t>(block_contents.size())); + XXH32_update(xxh, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); + break; + } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, block_contents.data(), 
+ static_cast<uint32_t>(block_contents.size())); + XXH64_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32( + trailer_without_type, + static_cast<uint32_t>(XXH64_digest(state) & // lower 32 bits + uint64_t{0xffffffff})); + XXH64_freeState(state); + break; + } + } + + assert(r->status.ok()); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", + static_cast<char*>(trailer)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - ((block_contents.size() + kBlockTrailerSize) & + (r->alignment - 1))) & + (r->alignment - 1); + r->status = r->file->Pad(pad_bytes); + if (r->status.ok()) { + r->offset += pad_bytes; + } + } + } + } +} + +Status BlockBasedTableBuilder::status() const { return rep_->status; } + +static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { + BlockContents* bc = reinterpret_cast<BlockContents*>(value); + delete bc; +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + size_t size = block_contents.size(); + + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); + memcpy(ubuf.get(), block_contents.data(), size); + ubuf[size] = type; + + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->is_raw_block = true; +#endif // NDEBUG + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, + static_cast<size_t>(end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCachedBlockContents); + + // Invalidate OS cache. + r->file->InvalidateCache(static_cast<size_t>(r->offset), size); + } + return Status::OK(); +} + +void BlockBasedTableBuilder::WriteFilterBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle filter_block_handle; + bool empty_filter_block = (rep_->filter_builder == nullptr || + rep_->filter_builder->NumAdded() == 0); + if (ok() && !empty_filter_block) { + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + Slice filter_content = + rep_->filter_builder->Finish(filter_block_handle, &s); + assert(s.ok() || s.IsIncomplete()); + rep_->props.filter_size += filter_content.size(); + WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); + } + } + if (ok() && !empty_filter_block) { + // Add mapping from "<filter_block_prefix>.Name" to location + // of filter data. + std::string key; + if (rep_->filter_builder->IsBlockBased()) { + key = BlockBasedTable::kFilterBlockPrefix; + } else { + key = rep_->table_options.partition_filters + ? 
BlockBasedTable::kPartitionedFilterBlockPrefix + : BlockBasedTable::kFullFilterBlockPrefix; + } + key.append(rep_->table_options.filter_policy->Name()); + meta_index_builder->Add(key, filter_block_handle); + } +} + +void BlockBasedTableBuilder::WriteIndexBlock( + MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) { + IndexBuilder::IndexBlocks index_blocks; + auto index_builder_status = rep_->index_builder->Finish(&index_blocks); + if (index_builder_status.IsIncomplete()) { + // We we have more than one index partition then meta_blocks are not + // supported for the index. Currently meta_blocks are used only by + // HashIndexBuilder which is not multi-partition. + assert(index_blocks.meta_blocks.empty()); + } else if (ok() && !index_builder_status.ok()) { + rep_->status = index_builder_status; + } + if (ok()) { + for (const auto& item : index_blocks.meta_blocks) { + BlockHandle block_handle; + WriteBlock(item.second, &block_handle, false /* is_data_block */); + if (!ok()) { + break; + } + meta_index_builder->Add(item.first, block_handle); + } + } + if (ok()) { + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + } + // If there are more index partitions, finish them and write them out + Status s = index_builder_status; + while (ok() && s.IsIncomplete()) { + s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (!s.ok() && !s.IsIncomplete()) { + rep_->status = s; + return; + } + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + // The last index_block_handle will be for the partition index block + } +} + +void BlockBasedTableBuilder::WritePropertiesBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle properties_block_handle; + if (ok()) { + PropertyBlockBuilder property_block_builder; + rep_->props.column_family_id = rep_->column_family_id; + rep_->props.column_family_name = rep_->column_family_name; + rep_->props.filter_policy_name = + rep_->table_options.filter_policy != nullptr + ? rep_->table_options.filter_policy->Name() + : ""; + rep_->props.index_size = + rep_->index_builder->IndexSize() + kBlockTrailerSize; + rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr + ? rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; + rep_->props.compression_name = + CompressionTypeToString(rep_->compression_type); + rep_->props.compression_options = + CompressionOptionsToString(rep_->compression_opts); + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? 
rep_->moptions.prefix_extractor->Name() + : "nullptr"; + + std::string property_collectors_names = "["; + for (size_t i = 0; + i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { + if (i != 0) { + property_collectors_names += ","; + } + property_collectors_names += + rep_->ioptions.table_properties_collector_factories[i]->Name(); + } + property_collectors_names += "]"; + rep_->props.property_collectors_names = property_collectors_names; + if (rep_->table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + assert(rep_->p_index_builder_ != nullptr); + rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); + rep_->props.top_level_index_size = + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); + } + rep_->props.index_key_is_user_key = + !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; + rep_->props.creation_time = rep_->creation_time; + rep_->props.oldest_key_time = rep_->oldest_key_time; + + // Add basic properties + property_block_builder.AddTableProperty(rep_->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, + rep_->ioptions.info_log, + &property_block_builder); + + WriteRawBlock(property_block_builder.Finish(), kNoCompression, + &properties_block_handle); + } + if (ok()) { +#ifndef NDEBUG + { + uint64_t props_block_offset = properties_block_handle.offset(); + uint64_t props_block_size = properties_block_handle.size(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset", + &props_block_offset); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize", + &props_block_size); + } +#endif // !NDEBUG + meta_index_builder->Add(kPropertiesBlock, properties_block_handle); + } +} + +void BlockBasedTableBuilder::WriteCompressionDictBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->compression_dict != nullptr && + rep_->compression_dict->GetRawDict().size()) { + BlockHandle compression_dict_block_handle; + if (ok()) { + WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression, + &compression_dict_block_handle); +#ifndef NDEBUG + Slice compression_dict = rep_->compression_dict->GetRawDict(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + &compression_dict); +#endif // NDEBUG + } + if (ok()) { + meta_index_builder->Add(kCompressionDictBlock, + compression_dict_block_handle); + } + } +} + +void BlockBasedTableBuilder::WriteRangeDelBlock( + MetaIndexBuilder* meta_index_builder) { + if (ok() && !rep_->range_del_block.empty()) { + BlockHandle range_del_block_handle; + WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression, + &range_del_block_handle); + meta_index_builder->Add(kRangeDelBlock, range_del_block_handle); + } +} + +void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle) { + Rep* r = rep_; + // No need to write out new footer if we're using default checksum. 
+ // We're writing the legacy magic number because we want old versions of
+ // RocksDB to be able to read files generated with a new release (just in
+ // case somebody wants to roll back after an upgrade)
+ // TODO(icanadi) at some point in the future, when we're absolutely sure
+ // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+ // number and always write new table files with new magic number
+ bool legacy = (r->table_options.format_version == 0);
+ // this is guaranteed by BlockBasedTableBuilder's constructor
+ assert(r->table_options.checksum == kCRC32c ||
+ r->table_options.format_version != 0);
+ Footer footer(
+ legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
+ r->table_options.format_version);
+ footer.set_metaindex_handle(metaindex_block_handle);
+ footer.set_index_handle(index_block_handle);
+ footer.set_checksum(r->table_options.checksum);
+ std::string footer_encoding;
+ footer.EncodeTo(&footer_encoding);
+ assert(r->status.ok());
+ r->status = r->file->Append(footer_encoding);
+ if (r->status.ok()) {
+ r->offset += footer_encoding.size();
+ }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+ Rep* r = rep_;
+ assert(r->state == Rep::State::kBuffered);
+ r->state = Rep::State::kUnbuffered;
+ const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+ ? r->compression_opts.zstd_max_train_bytes
+ : r->compression_opts.max_dict_bytes;
+ Random64 generator{r->creation_time};
+ std::string compression_dict_samples;
+ std::vector<size_t> compression_dict_sample_lens;
+ if (!r->data_block_and_keys_buffers.empty()) {
+ while (compression_dict_samples.size() < kSampleBytes) {
+ size_t rand_idx =
+ generator.Uniform(r->data_block_and_keys_buffers.size());
+ size_t copy_len =
+ std::min(kSampleBytes - compression_dict_samples.size(),
+ r->data_block_and_keys_buffers[rand_idx].first.size());
+ compression_dict_samples.append(
+ r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
+ compression_dict_sample_lens.emplace_back(copy_len);
+ }
+ }
+
+ // The final data block has been flushed; now we can generate a dictionary
+ // from the samples. It is OK if compression_dict_samples is empty; we'll
+ // just get an empty dictionary.
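The sampling above is driven by the dictionary-compression fields of CompressionOptions that this function reads (zstd_max_train_bytes and max_dict_bytes). As a caller-side sketch, not taken from this source and with arbitrary values, enabling the feature might look like:

    ColumnFamilyOptions cf_opts;
    cf_opts.compression = kZSTD;
    cf_opts.compression_opts.max_dict_bytes = 16 << 10;         // target dictionary size
    cf_opts.compression_opts.zstd_max_train_bytes = 100 << 10;  // sample budget for training

With zstd_max_train_bytes left at zero, the code below skips training and uses the concatenated samples directly as the dictionary.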
+ std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { + const auto& data_block = r->data_block_and_keys_buffers[i].first; + auto& keys = r->data_block_and_keys_buffers[i].second; + assert(!data_block.empty()); + assert(!keys.empty()); + + for (const auto& key : keys) { + if (r->filter_builder != nullptr) { + r->filter_builder->Add(ExtractUserKey(key)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); + if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + Slice first_key_in_next_block = + r->data_block_and_keys_buffers[i + 1].second.front(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, + r->pending_handle); + } + } + r->data_block_and_keys_buffers.clear(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + assert(r->state != Rep::State::kClosed); + bool empty_data_block = r->data_block.empty(); + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. 
Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); + if (ok()) { + // flush the meta index block + WriteRawBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle); + } + if (ok()) { + WriteFooter(metaindex_block_handle, index_block_handle); + } + r->state = Rep::State::kClosed; + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + assert(rep_->state != Rep::State::kClosed); + rep_->state = Rep::State::kClosed; +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } + +bool BlockBasedTableBuilder::NeedCompact() const { + for (const auto& collector : rep_->table_properties_collectors) { + if (collector->NeedCompact()) { + return true; + } + } + return false; +} + +TableProperties BlockBasedTableBuilder::GetTableProperties() const { + TableProperties ret = rep_->props; + for (const auto& collector : rep_->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + ret.readable_properties.insert(prop); + } + collector->Finish(&ret.user_collected_properties); + } + return ret; +} + +const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; +const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = + "partitionedfilter."; +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_table_builder.h b/src/rocksdb/table/block_based_table_builder.h new file mode 100644 index 00000000..b10494e7 --- /dev/null +++ b/src/rocksdb/table/block_based_table_builder.h @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/compression.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). 
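To make the contract described in the comment above concrete, here is a rough caller-side sketch of the builder life cycle; `factory`, `builder_options`, `file_writer`, and the keys are placeholders, and error handling is omitted:

    std::unique_ptr<TableBuilder> builder(factory->NewTableBuilder(
        builder_options, /*column_family_id=*/0, file_writer));
    builder->Add(key1, value1);    // keys must be added in sorted order
    builder->Add(key2, value2);
    Status s = builder->Finish();  // or builder->Abandon() to discard the table
    // Finish()/Abandon() stop using file_writer; closing it is the caller's job.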
+ BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const uint64_t creation_time = 0, + const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + bool NeedCompact() const override; + + // Get table properties + TableProperties GetTableProperties() const override; + + private: + bool ok() const { return status().ok(); } + + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. + // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + + // Call block's Finish() method + // and then write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + + // Compress and write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + bool is_data_block); + // Directly write data to the file. 
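A note on WriteRawBlock(), which WriteBlock() ultimately calls: every block written to the file is followed by a fixed kBlockTrailerSize trailer of 5 bytes, one byte recording the compression type plus a 32-bit checksum computed over the block contents and that type byte (see table/format.h). This is why block_based_table_builder.cc adds kBlockTrailerSize when it accounts for the index size in the table properties.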
+ void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, + bool is_data_block = false); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Some compression libraries fail when the raw size is bigger than int. If + // uncompressed size is bigger than kCompressionSizeLimit, don't compress it + const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max(); +}; + +Slice CompressBlock(const Slice& raw, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow); + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_table_factory.cc b/src/rocksdb/table/block_based_table_factory.cc new file mode 100644 index 00000000..cda8d1e2 --- /dev/null +++ b/src/rocksdb/table/block_based_table_factory.cc @@ -0,0 +1,627 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based_table_factory.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <stdint.h> + +#include <memory> +#include <string> + +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/flush_block_policy.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_reader.h" +#include "table/format.h" +#include "util/mutexlock.h" +#include "util/string_util.h" + +namespace rocksdb { + +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector<size_t> sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. 
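Stated as a formula, this is what the selection loop further below computes: for a candidate size c = sorted[i] over n tracked records, read = c * n and wasted = (c - sorted[0]) + (c - sorted[1]) + ... + (c - sorted[i-1]); the suggestion is the largest c for which wasted <= read / 8.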
+ std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. + // The difference between wasted using 4st and 3rd record, will + // be following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between 4st and 3rd record, + // times 3, which is number of records before the 4st. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. 
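Since the selection rule is compact, a standalone sketch with made-up sizes may be the quickest way to see it in action; only the rule mirrors the loop that follows, the numbers are arbitrary:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      // Pretend these are the tracked tail sizes, already sorted (e.g. in KB).
      std::vector<size_t> sorted = {30, 31, 32, 33, 40};
      size_t prev = sorted[0], best = sorted[0], wasted = 0;
      for (size_t i = 1; i < sorted.size(); i++) {
        size_t read = sorted[i] * sorted.size();
        wasted += (sorted[i] - prev) * i;
        if (wasted <= read / 8) {
          best = sorted[i];
        }
        prev = sorted[i];
      }
      // The similar sizes 31..33 qualify, the outlier 40 does not: prints 33.
      std::printf("suggested size (before the 512KB cap): %zu\n", best);
      return 0;
    }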
+ assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + table_options_.block_cache = NewLRUCache(8 << 20); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } +} + +Status BlockBasedTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader, table_reader_options.prefix_extractor, + prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + auto table_builder = new BlockBasedTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_options_, table_builder_options.internal_comparator, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_builder_options.compression_type, + table_builder_options.sample_for_compression, + table_builder_options.compression_opts, + table_builder_options.skip_filters, + table_builder_options.column_family_name, + table_builder_options.creation_time, + table_builder_options.oldest_key_time, + table_builder_options.target_file_size); + + return table_builder; +} + +Status BlockBasedTableFactory::SanitizeOptions( + const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument( + "Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return 
Status::InvalidArgument( + "Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } + if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { + return Status::InvalidArgument( + "Unsupported BlockBasedTable format_version. Please check " + "include/rocksdb/table.h for more info"); + } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument( + "Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } + return Status::OK(); +} + +std::string BlockBasedTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", + table_options_.flush_block_policy_factory->Name(), + static_cast<void*>(table_options_.flush_block_policy_factory.get())); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", + table_options_.cache_index_and_filter_blocks); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " cache_index_and_filter_blocks_with_high_priority: %d\n", + table_options_.cache_index_and_filter_blocks_with_high_priority); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", + table_options_.pin_top_level_index_and_filter); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", + table_options_.hash_index_allow_collision); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast<void*>(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + 
snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + static_cast<void*>(table_options_.block_cache_compressed.get())); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + const char* block_cache_compressed_name = + table_options_.block_cache_compressed->Name(); + if (block_cache_compressed_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_compressed_name); + ret.append(buffer); + } + ret.append(" block_cache_compressed_options:\n"); + ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast<void*>(table_options_.persistent_cache.get())); + ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? 
"nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + return ret; +} + +#ifndef ROCKSDB_LITE +namespace { +bool SerializeSingleBlockBasedTableOption( + std::string* opt_string, const BlockBasedTableOptions& bbt_options, + const std::string& name, const std::string& delimiter) { + auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + return false; + } + auto& opt_info = iter->second; + const char* opt_address = + reinterpret_cast<const char*>(&bbt_options) + opt_info.offset; + std::string value; + bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); + if (result) { + *opt_string = name + "=" + value + delimiter; + } + return result; +} +} // namespace + +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + assert(opt_string); + opt_string->clear(); + for (auto iter = block_based_table_type_info.begin(); + iter != block_based_table_type_info.end(); ++iter) { + if (iter->second.verification == OptionVerificationType::kDeprecated) { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + continue; + } + std::string single_output; + bool result = SerializeSingleBlockBasedTableOption( + &single_output, table_options_, iter->first, delimiter); + assert(result); + if (result) { + opt_string->append(single_output); + } + } + return Status::OK(); +} +#else +Status BlockBasedTableFactory::GetOptionString( + std::string* /*opt_string*/, const std::string& /*delimiter*/) const { + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { + return table_options_; +} + +#ifndef ROCKSDB_LITE +namespace { +std::string ParseBlockBasedTableOption(const std::string& name, + const std::string& org_value, + BlockBasedTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + if (!input_strings_escaped) { + // if the input string is not escaped, it means this function is + // invoked from SetOptions, which takes the old format. + if (name == "block_cache" || name == "block_cache_compressed") { + // cache options can be specified in the following format + // "block_cache={capacity=1M;num_shard_bits=4; + // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" + // To support backward compatibility, the following format + // is also supported. + // "block_cache=1M" + std::shared_ptr<Cache> cache; + // block_cache is specified in format block_cache=<cache_size>. 
+ if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + if (!ParseOptionHelper(reinterpret_cast<char*>(&cache_opts), + OptionType::kLRUCacheOptions, value)) { + return "Invalid cache options"; + } + cache = NewLRUCache(cache_opts); + } + + if (name == "block_cache") { + new_options->block_cache = cache; + } else { + new_options->block_cache_compressed = cache; + } + return ""; + } else if (name == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (value.compare(0, kName.size(), kName) != 0) { + return "Invalid filter policy name"; + } + size_t pos = value.find(':', kName.size()); + if (pos == std::string::npos) { + return "Invalid filter policy config, missing bits_per_key"; + } + int bits_per_key = + ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + new_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + return ""; + } + } + const auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} +} // namespace + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParseBlockBasedTableOption( + o.first, o.second, new_table_options, input_strings_escaped, + ignore_unknown_options); + if (error_message != "") { + const auto iter = block_based_table_type_info.find(o.first); + if (iter == block_based_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". 
+ *new_table_options = table_options; + return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level) { + if ((base_tf != nullptr) != (file_tf != nullptr) && + sanity_check_level > kSanityLevelNone) { + return Status::Corruption( + "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); + } + if (base_tf == nullptr) { + return Status::OK(); + } + assert(file_tf != nullptr); + + const auto& base_opt = base_tf->table_options(); + const auto& file_opt = file_tf->table_options(); + + for (auto& pair : block_based_table_type_info) { + if (pair.second.verification == OptionVerificationType::kDeprecated) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + continue; + } + if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { + if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt), + reinterpret_cast<const char*>(&file_opt), + pair.second, pair.first, nullptr)) { + return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on BlockBasedTableOptions::", + pair.first); + } + } + } + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_table_factory.h b/src/rocksdb/table/block_based_table_factory.h new file mode 100644 index 00000000..100bb0bc --- /dev/null +++ b/src/rocksdb/table/block_based_table_factory.h @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <memory> +#include <string> + +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct EnvOptions; + +class BlockBasedTableBuilder; + +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. 
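Two details worth noting next to this declaration, both visible in the .cc file earlier in this diff: the suggestion is capped at 512 KB (kMaxPrefetchSize), and 0 is returned when no sizes have been recorded yet.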
+ size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + const char* Name() const override { return kName.c_str(); } + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + // Sanitizes the specified DB Options. + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + + std::string GetPrintableTableOptions() const override; + + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override; + + const BlockBasedTableOptions& table_options() const; + + void* GetOptions() override { return &table_options_; } + + bool IsDeleteRangeSupported() const override { return true; } + + static const std::string kName; + + private: + BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +extern const std::string kPropTrue; +extern const std::string kPropFalse; + +#ifndef ROCKSDB_LITE +extern Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level); + +static std::unordered_map<std::string, OptionTypeInfo> + block_based_table_type_info = { + /* currently not supported + std::shared_ptr<Cache> block_cache = nullptr; + std::shared_ptr<Cache> block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), + OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, + false, 0}}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"index_type", + {offsetof(struct BlockBasedTableOptions, index_type), + OptionType::kBlockBasedTableIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + 
data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, false, + 0}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, + 0}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kFilterPolicy, OptionVerificationType::kByName, false, + 0}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; +#endif // !ROCKSDB_LITE +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_table_reader.cc b/src/rocksdb/table/block_based_table_reader.cc new file mode 100644 index 00000000..dc2d4263 --- /dev/null +++ b/src/rocksdb/table/block_based_table_reader.cc @@ -0,0 +1,3507 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based_table_reader.h" + +#include <algorithm> +#include <array> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block.h" +#include "table/block_based_filter_block.h" +#include "table/block_based_table_factory.h" +#include "table/block_fetcher.h" +#include "table/block_prefix_index.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/full_filter_block.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/partitioned_filter_block.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" + +#include "monitoring/perf_context_imp.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/sync_point.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTable::IndexReader IndexReader; + +BlockBasedTable::~BlockBasedTable() { + Close(); + delete rep_; +} + +std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0); + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr<Block>* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { + BlockContents contents; + BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, + &contents, ioptions, do_uncompress, + maybe_compressed, uncompression_dict, + cache_options, memory_allocator); + Status s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + result->reset(new Block(std::move(contents), global_seqno, + read_amp_bytes_per_bit, ioptions.statistics)); + } + + return s; +} + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? 
table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Delete the resource that is held by the iterator. +template <class ResourceType> +void DeleteHeldResource(void* arg, void* /*ignored*/) { + delete reinterpret_cast<ResourceType*>(arg); +} + +// Delete the entry resided in the cache. +template <class Entry> +void DeleteCachedEntry(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast<Entry*>(value); + delete entry; +} + +void DeleteCachedFilterEntry(const Slice& key, void* value); +void DeleteCachedIndexEntry(const Slice& key, void* value); +void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); + +// Release the cached entry and decrement its ref count. +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle); +} + +// Release the cached entry and decrement its ref count. +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, true /* force_erase */); +} + +Slice GetCacheKeyFromOffset(const char* cache_key_prefix, + size_t cache_key_prefix_size, uint64_t offset, + char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); + return Slice(cache_key, static_cast<size_t>(end - cache_key)); +} + +Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + int level, Tickers block_cache_miss_ticker, + Tickers block_cache_hit_ticker, + uint64_t* block_cache_miss_stats, + uint64_t* block_cache_hit_stats, + Statistics* statistics, + GetContext* get_context) { + auto cache_handle = block_cache->Lookup(key, statistics); + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast<uint32_t>(level)); + if (get_context != nullptr) { + // overall cache hit + get_context->get_context_stats_.num_cache_hit++; + // total bytes read from cache + get_context->get_context_stats_.num_cache_bytes_read += + block_cache->GetUsage(cache_handle); + // block-type specific cache hit + (*block_cache_hit_stats)++; + } else { + // overall cache hit + RecordTick(statistics, BLOCK_CACHE_HIT); + // total bytes read from cache + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(cache_handle)); + RecordTick(statistics, block_cache_hit_ticker); + } + } else { + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast<uint32_t>(level)); + if (get_context != nullptr) { + // overall cache miss + get_context->get_context_stats_.num_cache_miss++; + // block-type specific cache miss + (*block_cache_miss_stats)++; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + RecordTick(statistics, block_cache_miss_ticker); + } + } + + return cache_handle; +} + +// For hash based index, return true if prefix_extractor and +// prefix_extractor_block mismatch, false otherwise. This flag will be used +// as total_order_seek via NewIndexIterator +bool PrefixExtractorChanged(const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
+ // Turn off hash index in prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) != 0) { + return true; + } else { + return false; + } +} + +} // namespace + +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public IndexReader, public Cleanable { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const BlockHandle& index_handle, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, + IndexReader** index_reader, + const PersistentCacheOptions& cache_options, + const int level, const bool index_key_includes_seq, + const bool index_value_is_full, + MemoryAllocator* memory_allocator) { + std::unique_ptr<Block> index_block; + auto s = ReadBlockFromFile( + file, prefetch_buffer, footer, ReadOptions(), index_handle, + &index_block, ioptions, true /* decompress */, + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, memory_allocator); + + if (s.ok()) { + *index_reader = new PartitionIndexReader( + table, icomparator, std::move(index_block), ioptions.statistics, + level, index_key_includes_seq, index_value_is_full); + } + + return s; + } + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase<BlockHandle>* NewIterator( + IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, + bool fill_cache = true) override { + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + return NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState( + table_, &partition_map_, index_key_includes_seq_, + index_value_is_full_), + index_block_->NewIterator<IndexBlockIter>( + icomparator_, icomparator_->user_comparator(), nullptr, + kNullStats, true, index_key_includes_seq_, index_value_is_full_)); + } else { + auto ro = ReadOptions(); + ro.fill_cache = fill_cache; + bool kIsIndex = true; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + return new BlockBasedTableIterator<IndexBlockIter, BlockHandle>( + table_, ro, *icomparator_, + index_block_->NewIterator<IndexBlockIter>( + icomparator_, icomparator_->user_comparator(), nullptr, + kNullStats, true, index_key_includes_seq_, index_value_is_full_), + false, true, /* prefix_extractor */ nullptr, kIsIndex, + index_key_includes_seq_, index_value_is_full_); + } + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+ } + + void CacheDependencies(bool pin) override { + // Before read partitions, prefetch them to avoid lots of IOs + auto rep = table_->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + index_block_->NewIterator<IndexBlockIter>( + icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value(); + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value(); + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + Cache* block_cache = rep->table_options.block_cache.get(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value(); + BlockBasedTable::CachableEntry<Block> block; + const bool is_index = true; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table_->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), rep, ro, handle, + UncompressionDict::GetEmptyDict(), &block, is_index, + nullptr /* get_context */); + + assert(s.ok() || block.value == nullptr); + if (s.ok() && block.value != nullptr) { + if (block.cache_handle != nullptr) { + if (pin) { + partition_map_[handle.offset()] = block; + RegisterCleanup(&ReleaseCachedEntry, block_cache, + block.cache_handle); + } else { + block_cache->Release(block.cache_handle); + } + } else { + delete block.value; + } + } + } + } + + size_t size() const override { return index_block_->size(); } + size_t usable_size() const override { return index_block_->usable_size(); } + + size_t ApproximateMemoryUsage() const override { + assert(index_block_); + size_t usage = index_block_->ApproximateMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(BlockBasedTable* table, + const InternalKeyComparator* icomparator, + std::unique_ptr<Block>&& index_block, Statistics* stats, + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) + : IndexReader(icomparator, stats), + table_(table), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { + assert(index_block_ != nullptr); + } + BlockBasedTable* table_; + std::unique_ptr<Block> index_block_; + std::unordered_map<uint64_t, BlockBasedTable::CachableEntry<Block>> + partition_map_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; +}; + +// Index that allows binary search lookup for the first key of each block. 
+// This class can be viewed as a thin wrapper around the `Block` class, which
+// already supports binary search.
+class BinarySearchIndexReader : public IndexReader {
+ public:
+ // Read index from the file and create an instance for
+ // `BinarySearchIndexReader`.
+ // On success, index_reader will be populated; otherwise it will remain
+ // unmodified.
+ static Status Create(RandomAccessFileReader* file,
+ FilePrefetchBuffer* prefetch_buffer,
+ const Footer& footer, const BlockHandle& index_handle,
+ const ImmutableCFOptions& ioptions,
+ const InternalKeyComparator* icomparator,
+ IndexReader** index_reader,
+ const PersistentCacheOptions& cache_options,
+ const bool index_key_includes_seq,
+ const bool index_value_is_full,
+ MemoryAllocator* memory_allocator) {
+ std::unique_ptr<Block> index_block;
+ auto s = ReadBlockFromFile(
+ file, prefetch_buffer, footer, ReadOptions(), index_handle,
+ &index_block, ioptions, true /* decompress */,
+ true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
+ cache_options, kDisableGlobalSequenceNumber,
+ 0 /* read_amp_bytes_per_bit */, memory_allocator);
+
+ if (s.ok()) {
+ *index_reader = new BinarySearchIndexReader(
+ icomparator, std::move(index_block), ioptions.statistics,
+ index_key_includes_seq, index_value_is_full);
+ }
+
+ return s;
+ }
+
+ InternalIteratorBase<BlockHandle>* NewIterator(
+ IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true,
+ bool /*dont_care*/ = true) override {
+ Statistics* kNullStats = nullptr;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ return index_block_->NewIterator<IndexBlockIter>(
+ icomparator_, icomparator_->user_comparator(), iter, kNullStats, true,
+ index_key_includes_seq_, index_value_is_full_);
+ }
+
+ size_t size() const override { return index_block_->size(); }
+ size_t usable_size() const override { return index_block_->usable_size(); }
+
+ size_t ApproximateMemoryUsage() const override {
+ assert(index_block_);
+ size_t usage = index_block_->ApproximateMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size((void*)this);
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+ }
+
+ private:
+ BinarySearchIndexReader(const InternalKeyComparator* icomparator,
+ std::unique_ptr<Block>&& index_block,
+ Statistics* stats, const bool index_key_includes_seq,
+ const bool index_value_is_full)
+ : IndexReader(icomparator, stats),
+ index_block_(std::move(index_block)),
+ index_key_includes_seq_(index_key_includes_seq),
+ index_value_is_full_(index_value_is_full) {
+ assert(index_block_ != nullptr);
+ }
+ std::unique_ptr<Block> index_block_;
+ const bool index_key_includes_seq_;
+ const bool index_value_is_full_;
+};
+
+// Index that leverages an internal hash table to quicken the lookup for a given
+// key.
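For context on the class below: the hash table it consults is loaded from two meta blocks written at build time, kHashIndexPrefixesBlock and kHashIndexPrefixesMetadataBlock. As Create() shows, failing to find or parse either block is deliberately not an error; the reader silently falls back to plain binary search over the index block.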
+class HashIndexReader : public IndexReader { + public: + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full, + MemoryAllocator* memory_allocator) { + std::unique_ptr<Block> index_block; + auto s = ReadBlockFromFile( + file, prefetch_buffer, footer, ReadOptions(), index_handle, + &index_block, ioptions, true /* decompress */, + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, memory_allocator); + + if (!s.ok()) { + return s; + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. + + auto new_index_reader = new HashIndexReader( + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); + *index_reader = new_index_reader; + + // Get prefixes block + BlockHandle prefixes_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + new_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); + } + + InternalIteratorBase<BlockHandle>* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool /*dont_care*/ = true) override { + Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. 
+ return index_block_->NewIterator<IndexBlockIter>( + icomparator_, icomparator_->user_comparator(), iter, kNullStats, + total_order_seek, index_key_includes_seq_, index_value_is_full_, + false /* block_contents_pinned */, prefix_index_.get()); + } + + size_t size() const override { return index_block_->size(); } + size_t usable_size() const override { return index_block_->usable_size(); } + + size_t ApproximateMemoryUsage() const override { + assert(index_block_); + size_t usage = index_block_->ApproximateMemoryUsage(); + usage += prefixes_contents_.usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const InternalKeyComparator* icomparator, + std::unique_ptr<Block>&& index_block, Statistics* stats, + const bool index_key_includes_seq, + const bool index_value_is_full) + : IndexReader(icomparator, stats), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { + assert(index_block_ != nullptr); + } + + ~HashIndexReader() override {} + + std::unique_ptr<Block> index_block_; + std::unique_ptr<BlockPrefixIndex> prefix_index_; + BlockContents prefixes_contents_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; +}; + +// Helper function to setup the cache key's prefix for the Table. +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + // Create dummy offset of index reader which is beyond the file size. + rep->dummy_index_reader_offset = + file_size + rep->table_options.block_cache->NewId(); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); + } + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), + rep->file->file(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (*size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +namespace { +// Return True if table_properties has `user_prop_name` has a `true` value +// or it doesn't contain this property (for backward compatible). 
+bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older version doesn't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array<char, 200> msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file have global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array<char, 200> msg_buf; + // This is a v1 external sst file, global_seqno is not supported. + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno + // to denote it is unknown. 
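+  // To summarize the cases above and below (a reading aid only): a file
+  // without the kVersion property is not external, so any kGlobalSeqno
+  // property is corruption; a v1 external file must not carry kGlobalSeqno at
+  // all; for v2+ external files the seqno is taken from kGlobalSeqno (or 0)
+  // and, when largest_seqno is known, must agree with it.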
+ if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array<char, 200> msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast<unsigned long long>(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array<char, 200> msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast<unsigned long long>(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast<size_t>(end - cache_key)); +} + +Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, + const bool skip_filters, const int level, + const bool immortal_table, + const SequenceNumber largest_seqno, + TailPrefetchStats* tail_prefetch_stats) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with newer " + "version of RocksDB?"); + } + + // We've successfully read the footer. We are ready to serve requests. + // Better not mutate rep_ after the creation. eg. internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. 
+ Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, level, + immortal_table); + rep->file = std::move(file); + rep->footer = footer; + rep->index_type = table_options.index_type; + rep->hash_index_allow_collision = table_options.hash_index_allow_collision; + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(prefix_extractor)); + SetupCacheKeyPrefix(rep, file_size); + std::unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep)); + + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + + // Read metaindex + std::unique_ptr<Block> meta; + std::unique_ptr<InternalIterator> meta_iter; + s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); + if (!s.ok()) { + return s; + } + + s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), + largest_seqno); + if (!s.ok()) { + return s; + } + s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), + internal_comparator); + if (!s.ok()) { + return s; + } + s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), + new_table.get(), prefix_extractor, + prefetch_all, table_options, level, + prefetch_index_and_filter_in_cache); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read()); + } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast<size_t>(file_size); + } else { + prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. 
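+  // Illustrative example of the window computed above (numbers are
+  // hypothetical): with tail_prefetch_size = 512 KB, a 10 MB file gives
+  // prefetch_off = 10 MB - 512 KB and prefetch_len = 512 KB, while a file
+  // smaller than the window is prefetched in full from offset 0.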
+ if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast<int>(len), 0); + break; + case kxxHash64: + actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. + BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, + rep->footer, rep->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = props_block_handle.size(); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), + block_size + 1, value); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { + bool found_properties_block = true; + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties( + meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, + rep->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno( + rep, prefetch_buffer, 
meta_iter->value(), &table_properties); + } + std::unique_ptr<TableProperties> props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); + } + + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep->table_properties.reset(props_guard.release()); + rep->blocks_maybe_compressed = rep->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep->blocks_definitely_zstd_compressed = + (rep->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep->ioptions.info_log, + "Cannot find Properties block from file."); + } +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + ParseSliceTransform(rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. + if (rep->table_properties) { + rep->whole_key_filtering &= + IsFeatureSupported(*(rep->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep->ioptions.info_log); + rep->prefix_filtering &= IsFeatureSupported( + *(rep->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); + + s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, + &(rep->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>( + rep, read_options, range_del_handle, nullptr /* input_iter */, + false /* is_index */, true /* key_includes_seq */, + true /* index_key_is_full */, nullptr /* get_context */, Status(), + prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep->fragmented_range_dels = + std::make_shared<FragmentedRangeTombstoneList>(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::ReadCompressionDictBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<const BlockContents>* compression_dict_block) { + assert(compression_dict_block != nullptr); + Status s; + if (!rep->compression_dict_handle.IsNull()) { + std::unique_ptr<BlockContents> compression_dict_cont{new BlockContents()}; + PersistentCacheOptions cache_options; + ReadOptions read_options; + read_options.verify_checksums = true; + BlockFetcher compression_block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, read_options, + rep->compression_dict_handle, compression_dict_cont.get(), + rep->ioptions, false /* decompress */, 
false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options); + s = compression_block_fetcher.ReadBlockContents(); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } else { + *compression_dict_block = std::move(compression_dict_cont); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, const SliceTransform* prefix_extractor, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, const bool prefetch_index_and_filter_in_cache) { + Status s; + + // Find filter handle and filter type + if (rep->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) + .ok()) { + rep->filter_type = filter_type; + break; + } + } + } + + { + // Find compression dictionary handle + bool found_compression_dict; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep->compression_dict_handle); + } + + bool need_upper_bound_check = + PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); + + BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); + // prefetch the first level of index + const bool prefetch_index = + prefetch_all || + (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // prefetch the first level of filter + const bool prefetch_filter = + prefetch_all || (table_options.pin_top_level_index_and_filter && + rep->filter_type == Rep::FilterType::kPartitionedFilter); + // Partition fitlers cannot be enabled without partition indexes + assert(!prefetch_filter || prefetch_index); + // pin both index and filters, down to all partitions + const bool pin_all = + rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + // pin the first level of index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of filter + const bool pin_filter = + pin_all || (table_options.pin_top_level_index_and_filter && + rep->filter_type == Rep::FilterType::kPartitionedFilter); + // pre-fetching of blocks is turned on + // Will use block cache for meta-blocks access + // Always prefetch index and filter for level 0 + // TODO(ajkr): also prefetch compression dictionary block + if (table_options.cache_index_and_filter_blocks) { + assert(table_options.block_cache != nullptr); + if (prefetch_index) { + // Hack: Call NewIndexIterator() to implicitly add index to the + // block_cache + CachableEntry<IndexReader> index_entry; + // check prefix_extractor match only if hash based index is used + bool disable_prefix_seek = + rep->index_type == BlockBasedTableOptions::kHashSearch && + need_upper_bound_check; 
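+      // The iterator created below pulls the top-level index block into the
+      // block cache; when prefetch_all is set its partitions are warmed too
+      // (CacheDependencies below), and pin_index decides whether the cache
+      // handle is kept in rep->index_entry or released right away.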
+ if (s.ok()) { + std::unique_ptr<InternalIteratorBase<BlockHandle>> iter( + new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, + nullptr, &index_entry)); + s = iter->status(); + } + if (s.ok()) { + // This is the first call to NewIndexIterator() since we're in Open(). + // On success it should give us ownership of the `CachableEntry` by + // populating `index_entry`. + assert(index_entry.value != nullptr); + if (prefetch_all) { + index_entry.value->CacheDependencies(pin_all); + } + if (pin_index) { + rep->index_entry = std::move(index_entry); + } else { + index_entry.Release(table_options.block_cache.get()); + } + } + } + if (s.ok() && prefetch_filter) { + // Hack: Call GetFilter() to implicitly add filter to the block_cache + auto filter_entry = + new_table->GetFilter(rep->table_prefix_extractor.get()); + if (filter_entry.value != nullptr && prefetch_all) { + filter_entry.value->CacheDependencies( + pin_all, rep->table_prefix_extractor.get()); + } + // if pin_filter is true then save it in rep_->filter_entry; it will be + // released in the destructor only, hence it will be pinned in the + // cache while this reader is alive + if (pin_filter) { + rep->filter_entry = filter_entry; + } else { + filter_entry.Release(table_options.block_cache.get()); + } + } + } else { + // If we don't use block cache for meta-block access, we'll pre-load these + // blocks, which will kept in member variables in Rep and with a same life- + // time as this table object. + IndexReader* index_reader = nullptr; + if (s.ok()) { + s = new_table->CreateIndexReader(prefetch_buffer, &index_reader, + meta_iter, level); + } + std::unique_ptr<const BlockContents> compression_dict_block; + if (s.ok()) { + rep->index_reader.reset(index_reader); + // The partitions of partitioned index are always stored in cache. 
They + // are hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_index_and_filter_in_cache || level == 0) { + rep->index_reader->CacheDependencies(pin_all); + } + + // Set filter block + if (rep->filter_policy) { + const bool is_a_filter_partition = true; + auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle, + !is_a_filter_partition, + rep->table_prefix_extractor.get()); + rep->filter.reset(filter); + // Refer to the comment above about paritioned indexes always being + // cached + if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { + filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); + } + } + s = ReadCompressionDictBlock(rep, prefetch_buffer, + &compression_dict_block); + } else { + delete index_reader; + } + if (s.ok() && !rep->compression_dict_handle.IsNull()) { + assert(compression_dict_block != nullptr); + // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy + rep->uncompression_dict.reset(new UncompressionDict( + compression_dict_block->data.ToString(), + rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics)); + } + } + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->ioptions.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->file()->Hint(RandomAccessFile::NORMAL); + break; + case Options::SEQUENTIAL: + rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL); + break; + case Options::WILLNEED: + rep_->file->file()->Hint(RandomAccessFile::WILLNEED); + break; + default: + assert(false); + } +} + +std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties() + const { + return rep_->table_properties; +} + +size_t BlockBasedTable::ApproximateMemoryUsage() const { + size_t usage = 0; + if (rep_->filter) { + usage += rep_->filter->ApproximateMemoryUsage(); + } + if (rep_->index_reader) { + usage += rep_->index_reader->ApproximateMemoryUsage(); + } + if (rep_->uncompression_dict) { + usage += rep_->uncompression_dict->ApproximateMemoryUsage(); + } + return usage; +} + +// Load the meta-block from the file. On success, return the loaded meta block +// and its iterator. +Status BlockBasedTable::ReadMetaBlock(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* meta_block, + std::unique_ptr<InternalIterator>* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr<Block> meta; + Status s = ReadBlockFromFile( + rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), + rep->footer.metaindex_handle(), &meta, rep->ioptions, + true /* decompress */, true /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep->table_options)); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.info_log, + "Encountered error while reading data from properties" + " block %s", + s.ToString().c_str()); + return s; + } + + *meta_block = std::move(meta); + // meta block uses bytewise comparator. 
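+  // Each metaindex entry maps a meta block name (properties, filter,
+  // compression dictionary, ...) to an encoded BlockHandle, so a plain
+  // DataBlockIter over it is sufficient here.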
+ iter->reset(meta_block->get()->NewIterator<DataBlockIter>( + BytewiseComparator(), BytewiseComparator())); + return Status::OK(); +} + +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry<Block>* block, + const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, + bool is_index, GetContext* get_context) { + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep->ioptions.statistics; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + block->cache_handle = GetEntryFromCache( + block_cache, block_cache_key, rep->level, + is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, + is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss + : &get_context->get_context_stats_.num_cache_data_miss) + : nullptr, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit + : &get_context->get_context_stats_.num_cache_data_hit) + : nullptr, + statistics, get_context); + if (block->cache_handle != nullptr) { + block->value = + reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)); + return s; + } + } + + // If not found, search from the compressed block cache. + assert(block->cache_handle == nullptr && block->value == nullptr); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + // if we found in the compressed cache, then uncompress and insert into + // uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast<BlockContents*>( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, compressed_block->data.data(), + compressed_block->data.size(), &contents, + rep->table_options.format_version, rep->ioptions, + GetMemoryAllocator(rep->table_options)); + + // Insert uncompressed block into block cache + if (s.ok()) { + block->value = + new Block(std::move(contents), rep->get_global_seqno(is_index), + read_amp_bytes_per_bit, + statistics); // uncompressed block + if (block_cache != nullptr && block->value->own_bytes() && + read_options.fill_cache) { + size_t charge = block->value->ApproximateMemoryUsage(); + s = block_cache->Insert(block_cache_key, block->value, charge, + &DeleteCachedEntry<Block>, + &(block->cache_handle)); +#ifndef NDEBUG + block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG + if (s.ok()) { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + 
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); + } + if (is_index) { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + } + } else { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); + } + } + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete block->value; + block->value = nullptr; + } + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, + CachableEntry<Block>* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool is_index, Cache::Priority priority, GetContext* get_context) { + assert(raw_block_comp_type == kNoCompression || + block_cache_compressed != nullptr); + + Status s; + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + Statistics* statistics = ioptions.statistics; + if (raw_block_comp_type != kNoCompression) { + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + } + if (!s.ok()) { + return s; + } + + if (raw_block_comp_type != kNoCompression) { + cached_block->value = new Block(std::move(uncompressed_block_contents), + seq_no, read_amp_bytes_per_bit, + statistics); // uncompressed block + } else { + cached_block->value = + new Block(std::move(*raw_block_contents), seq_no, + read_amp_bytes_per_bit, ioptions.statistics); + } + + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because this could point to + // an object in the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry<BlockContents>); + if (s.ok()) { + // Avoid the following code to delete this cached block. 
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + delete block_cont_for_comp_cache; + } + } + + // insert into uncompressed block cache + if (block_cache != nullptr && cached_block->value->own_bytes()) { + size_t charge = cached_block->value->ApproximateMemoryUsage(); + s = block_cache->Insert(block_cache_key, cached_block->value, charge, + &DeleteCachedEntry<Block>, + &(cached_block->cache_handle), priority); +#ifndef NDEBUG + block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG + if (s.ok()) { + assert(cached_block->cache_handle != nullptr); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); + } + if (is_index) { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + } + } else { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); + } + } + assert(reinterpret_cast<Block*>(block_cache->Value( + cached_block->cache_handle)) == cached_block->value); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete cached_block->value; + cached_block->value = nullptr; + } + } + + return s; +} + +FilterBlockReader* BlockBasedTable::ReadFilter( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, + const bool is_a_filter_partition, + const SliceTransform* prefix_extractor) const { + auto& rep = rep_; + // TODO: We might want to unify with ReadBlockFromFile() if we start + // requiring checksum verification in Table::Open. + if (rep->filter_type == Rep::FilterType::kNoFilter) { + return nullptr; + } + BlockContents block; + + BlockFetcher block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), + filter_handle, &block, rep->ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); + Status s = block_fetcher.ReadBlockContents(); + + if (!s.ok()) { + // Error reading the block + return nullptr; + } + + assert(rep->filter_policy); + + auto filter_type = rep->filter_type; + if (rep->filter_type == Rep::FilterType::kPartitionedFilter && + is_a_filter_partition) { + filter_type = Rep::FilterType::kFullFilter; + } + + switch (filter_type) { + case Rep::FilterType::kPartitionedFilter: { + return new PartitionedFilterBlockReader( + rep->prefix_filtering ? prefix_extractor : nullptr, + rep->whole_key_filtering, std::move(block), nullptr, + rep->ioptions.statistics, rep->internal_comparator, this, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); + } + + case Rep::FilterType::kBlockFilter: + return new BlockBasedFilterBlockReader( + rep->prefix_filtering ? 
prefix_extractor : nullptr, + rep->table_options, rep->whole_key_filtering, std::move(block), + rep->ioptions.statistics); + + case Rep::FilterType::kFullFilter: { + auto filter_bits_reader = + rep->filter_policy->GetFilterBitsReader(block.data); + assert(filter_bits_reader != nullptr); + return new FullFilterBlockReader( + rep->prefix_filtering ? prefix_extractor : nullptr, + rep->whole_key_filtering, std::move(block), filter_bits_reader, + rep->ioptions.statistics); + } + + default: + // filter_type is either kNoFilter (exited the function at the first if), + // or it must be covered in this switch block + assert(false); + return nullptr; + } +} + +BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter( + const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, + bool no_io, GetContext* get_context) const { + const BlockHandle& filter_blk_handle = rep_->filter_handle; + const bool is_a_filter_partition = true; + return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, + no_io, get_context, prefix_extractor); +} + +BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, + const bool is_a_filter_partition, bool no_io, GetContext* get_context, + const SliceTransform* prefix_extractor) const { + // If cache_index_and_filter_blocks is false, filter should be pre-populated. + // We will return rep_->filter anyway. rep_->filter can be nullptr if filter + // read fails at Open() time. We don't want to reload again since it will + // most probably fail again. + if (!is_a_filter_partition && + !rep_->table_options.cache_index_and_filter_blocks) { + return {rep_->filter.get(), nullptr /* cache handle */}; + } + + Cache* block_cache = rep_->table_options.block_cache.get(); + if (rep_->filter_policy == nullptr /* do not use filter */ || + block_cache == nullptr /* no block cache at all */) { + return {nullptr /* filter */, nullptr /* cache handle */}; + } + + if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { + return rep_->filter_entry; + } + + PERF_TIMER_GUARD(read_filter_block_nanos); + + // Fetching from the cache + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + filter_blk_handle, cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + auto cache_handle = GetEntryFromCache( + block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, + BLOCK_CACHE_FILTER_HIT, + get_context ? &get_context->get_context_stats_.num_cache_filter_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_filter_hit + : nullptr, + statistics, get_context); + + FilterBlockReader* filter = nullptr; + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + filter = + reinterpret_cast<FilterBlockReader*>(block_cache->Value(cache_handle)); + } else if (no_io) { + // Do not invoke any io. + return CachableEntry<FilterBlockReader>(); + } else { + filter = ReadFilter(prefetch_buffer, filter_blk_handle, + is_a_filter_partition, prefix_extractor); + if (filter != nullptr) { + size_t usage = filter->ApproximateMemoryUsage(); + Status s = block_cache->Insert( + key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, + rep_->table_options.cache_index_and_filter_blocks_with_high_priority + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW); + if (s.ok()) { + PERF_COUNTER_ADD(filter_block_read_count, 1); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += usage; + get_context->get_context_stats_.num_cache_filter_add++; + get_context->get_context_stats_.num_cache_filter_bytes_insert += + usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete filter; + return CachableEntry<FilterBlockReader>(); + } + } + } + + return {filter, cache_handle}; +} + +BlockBasedTable::CachableEntry<UncompressionDict> +BlockBasedTable::GetUncompressionDict(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, + bool no_io, GetContext* get_context) { + if (!rep->table_options.cache_index_and_filter_blocks) { + // block cache is either disabled or not used for meta-blocks. In either + // case, BlockBasedTableReader is the owner of the uncompression dictionary. + return {rep->uncompression_dict.get(), nullptr /* cache handle */}; + } + if (rep->compression_dict_handle.IsNull()) { + return {nullptr, nullptr}; + } + char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto cache_key = + GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + rep->compression_dict_handle, cache_key_buf); + auto cache_handle = GetEntryFromCache( + rep->table_options.block_cache.get(), cache_key, rep->level, + BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, + get_context + ? &get_context->get_context_stats_.num_cache_compression_dict_miss + : nullptr, + get_context + ? &get_context->get_context_stats_.num_cache_compression_dict_hit + : nullptr, + rep->ioptions.statistics, get_context); + UncompressionDict* dict = nullptr; + if (cache_handle != nullptr) { + dict = reinterpret_cast<UncompressionDict*>( + rep->table_options.block_cache->Value(cache_handle)); + } else if (no_io) { + // Do not invoke any io. + } else { + std::unique_ptr<const BlockContents> compression_dict_block; + Status s = + ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); + size_t usage = 0; + if (s.ok()) { + assert(compression_dict_block != nullptr); + // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy + dict = new UncompressionDict(compression_dict_block->data.ToString(), + rep->blocks_definitely_zstd_compressed, + rep->ioptions.statistics); + usage = dict->ApproximateMemoryUsage(); + s = rep->table_options.block_cache->Insert( + cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, + &cache_handle, + rep->table_options.cache_index_and_filter_blocks_with_high_priority + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW); + } + if (s.ok()) { + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += usage; + get_context->get_context_stats_.num_cache_compression_dict_add++; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(rep->ioptions.statistics, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); + } + } else { + // There should be no way to get here if block cache insertion succeeded. + // Though it is still possible something failed earlier. + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); + delete dict; + dict = nullptr; + assert(cache_handle == nullptr); + } + } + return {dict, cache_handle}; +} + +// disable_prefix_seek should be set to true when prefix_extractor found in SST +// differs from the one in mutable_cf_options and index type is HashBasedIndex +InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* input_iter, CachableEntry<IndexReader>* index_entry, + GetContext* get_context) { + // index reader has already been pre-populated. + if (rep_->index_reader) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + return rep_->index_reader->NewIterator( + input_iter, read_options.total_order_seek || disable_prefix_seek, + read_options.fill_cache); + } + // we have a pinned index block + if (rep_->index_entry.IsSet()) { + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + return rep_->index_entry.value->NewIterator( + input_iter, read_options.total_order_seek || disable_prefix_seek, + read_options.fill_cache); + } + + PERF_TIMER_GUARD(read_index_block_nanos); + + const bool no_io = read_options.read_tier == kBlockCacheTier; + Cache* block_cache = rep_->table_options.block_cache.get(); + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = + GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->dummy_index_reader_offset, cache_key); + Statistics* statistics = rep_->ioptions.statistics; + auto cache_handle = GetEntryFromCache( + block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS, + BLOCK_CACHE_INDEX_HIT, + get_context ? &get_context->get_context_stats_.num_cache_index_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_index_hit + : nullptr, + statistics, get_context); + + if (cache_handle == nullptr && no_io) { + if (input_iter != nullptr) { + input_iter->Invalidate(Status::Incomplete("no blocking io")); + return input_iter; + } else { + return NewErrorInternalIterator<BlockHandle>( + Status::Incomplete("no blocking io")); + } + } + + IndexReader* index_reader = nullptr; + if (cache_handle != nullptr) { + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + index_reader = + reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle)); + } else { + // Create index reader and put it in the cache. 
+ Status s; + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); + s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); + TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); + size_t charge = 0; + if (s.ok()) { + assert(index_reader != nullptr); + charge = index_reader->ApproximateMemoryUsage(); + s = block_cache->Insert( + key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle, + rep_->table_options.cache_index_and_filter_blocks_with_high_priority + ? Cache::Priority::HIGH + : Cache::Priority::LOW); + } + + if (s.ok()) { + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); + } + PERF_COUNTER_ADD(index_block_read_count, 1); + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + } else { + if (index_reader != nullptr) { + delete index_reader; + } + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + // make sure if something goes wrong, index_reader shall remain intact. + if (input_iter != nullptr) { + input_iter->Invalidate(s); + return input_iter; + } else { + return NewErrorInternalIterator<BlockHandle>(s); + } + } + } + + assert(cache_handle); + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + auto* iter = index_reader->NewIterator( + input_iter, read_options.total_order_seek || disable_prefix_seek); + + // the caller would like to take ownership of the index block + // don't call RegisterCleanup() in this case, the caller will take care of it + if (index_entry != nullptr) { + *index_entry = {index_reader, cache_handle}; + } else { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + } + + return iter; +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const BlockHandle& handle, + TBlockIter* input_iter, bool is_index, bool key_includes_seq, + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + Cache* block_cache = rep->table_options.block_cache.get(); + CachableEntry<Block> block; + TBlockIter* iter; + { + const bool no_io = (ro.read_tier == kBlockCacheTier); + auto uncompression_dict_storage = + GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + const UncompressionDict& uncompression_dict = + uncompression_dict_storage.value == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.value; + if (s.ok()) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + uncompression_dict, &block, is_index, + get_context); + } + + if (input_iter != nullptr) { + iter = input_iter; + } else { + iter = new TBlockIter; + } + // Didn't get any data from block caches. 
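+    // block.value is still nullptr here on a cache miss or when the cache is
+    // bypassed; fall back to reading the block from the file unless the read
+    // tier forbids I/O.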
+ if (s.ok() && block.value == nullptr) { + if (no_io) { + // Could not read from block_cache and can't do IO + iter->Invalidate(Status::Incomplete("no blocking io")); + return iter; + } + std::unique_ptr<Block> block_value; + { + StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep->file.get(), prefetch_buffer, rep->footer, ro, handle, + &block_value, rep->ioptions, + rep->blocks_maybe_compressed /*do_decompress*/, + rep->blocks_maybe_compressed, uncompression_dict, + rep->persistent_cache_options, + is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options)); + } + if (s.ok()) { + block.value = block_value.release(); + } + } + // TODO(ajkr): also pin compression dictionary block when + // `pin_l0_filter_and_index_blocks_in_cache == true`. + uncompression_dict_storage.Release(block_cache); + } + + if (s.ok()) { + assert(block.value != nullptr); + const bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + bool block_contents_pinned = + (block.cache_handle != nullptr || + (!block.value->own_bytes() && rep->immortal_table)); + iter = block.value->NewIterator<TBlockIter>( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full, block_contents_pinned); + if (block.cache_handle != nullptr) { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, + block.cache_handle); + } else { + if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep->cache_key_prefix_size != 0); + assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length)); + Slice unique_key = + Slice(cache_key, static_cast<size_t>(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.value->ApproximateMemoryUsage(), nullptr, + &cache_handle); + if (s.ok()) { + if (cache_handle != nullptr) { + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr); + } + } else { + assert(block.value == nullptr); + iter->Invalidate(s); + } + return iter; +} + +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, bool is_index, GetContext* get_context) { + assert(block_entry != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep->table_options.block_cache.get(); + + // No point to cache compressed blocks if it never goes away + Cache* block_cache_compressed = + rep->immortal_table ? nullptr + : rep->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. + Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep->compressed_cache_key_prefix, + rep->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + rep, ro, block_entry, uncompression_dict, + rep->table_options.read_amp_bytes_per_bit, + is_index, get_context); + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. 
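The cache keys probed above are simply the table's per-file cache-key prefix followed by a varint64 (the block handle's offset, or a counter for the dummy entries used to track memory). A rough standalone sketch of that composition, reusing the internal EncodeVarint64 helper from util/coding.h (the prefix value and offset below are made up):

#include <cstdint>
#include <cstring>

#include "rocksdb/slice.h"
#include "util/coding.h"  // EncodeVarint64, kMaxVarint64Length (internal header)

namespace {

// Compose <prefix><varint64(offset)> into `buffer`, mirroring the shape of
// BlockBasedTable::GetCacheKey. `buffer` must hold at least
// prefix_size + rocksdb::kMaxVarint64Length bytes.
rocksdb::Slice MakeBlockCacheKey(const char* prefix, size_t prefix_size,
                                 uint64_t block_offset, char* buffer) {
  memcpy(buffer, prefix, prefix_size);
  char* end = rocksdb::EncodeVarint64(buffer + prefix_size, block_offset);
  return rocksdb::Slice(buffer, static_cast<size_t>(end - buffer));
}

}  // namespace

int main() {
  // Made-up per-file prefix; the real one is derived from the file's unique ID.
  const char prefix[] = "sst-1234-";
  char buffer[sizeof(prefix) - 1 + rocksdb::kMaxVarint64Length];
  rocksdb::Slice key = MakeBlockCacheKey(prefix, sizeof(prefix) - 1,
                                         /*block_offset=*/8192, buffer);
  return key.size() > 0 ? 0 : 1;  // `key` is what the block cache is probed with
}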
+ if (block_entry->value == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep->ioptions.statistics; + bool do_decompress = + block_cache_compressed == nullptr && rep->blocks_maybe_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; + { + StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + BlockFetcher block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, ro, handle, + &raw_block_contents, rep->ioptions, + do_decompress /* do uncompress */, rep->blocks_maybe_compressed, + uncompression_dict, rep->persistent_cache_options, + GetMemoryAllocator(rep->table_options), + GetMemoryAllocatorForCompressedBlock(rep->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); + } + + if (s.ok()) { + SequenceNumber seq_no = rep->get_global_seqno(is_index); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, + block_entry, &raw_block_contents, raw_block_comp_type, + rep->table_options.format_version, uncompression_dict, seq_no, + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options), is_index, + is_index && rep->table_options + .cache_index_and_filter_blocks_with_high_priority + ? Cache::Priority::HIGH + : Cache::Priority::LOW, + get_context); + } + } + } + assert(s.ok() || block_entry->value == nullptr); + return s; +} + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + BlockBasedTable* table, + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, + bool index_key_includes_seq, bool index_key_is_full) + : table_(table), + block_map_(block_map), + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} + +template <class TBlockIter, typename TValue> +const size_t BlockBasedTableIterator<TBlockIter, TValue>::kMaxReadaheadSize = + 256 * 1024; + +InternalIteratorBase<BlockHandle>* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto rep = table_->get_rep(); + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); + Cache* block_cache = rep->table_options.block_cache.get(); + assert(block_cache); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(block->second.cache_handle)); + Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.value->NewIterator<IndexBlockIter>( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); + } + // Create an empty iterator + return new IndexBlockIter(); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. 
+// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check) { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto user_key = ExtractUserKey(internal_key); + if (!prefix_extractor->InDomain(user_key)) { + return true; + } + + bool may_match = true; + Status s; + + // First, try check with full filter + auto filter_entry = GetFilter(prefix_extractor); + FilterBlockReader* filter = filter_entry.value; + bool filter_checked = true; + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check); + } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return true; + } + auto prefix = prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + + // To prevent any io operation in this method, we set `read_tier` to make + // sure we always read index or filter only when they have already been + // loaded to memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + + // Then, try find it within each block + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter( + NewIndexIterator(no_io_read_options, + /* need_upper_bound_check */ false)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if ((rep_->table_properties && + rep_->table_properties->index_key_is_user_key + ? iiter->key() + : ExtractUserKey(iiter->key())) + .starts_with(ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. 
+ may_match = true; + } else if (filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only on could potentially contain the prefix. + BlockHandle handle = iiter->value(); + may_match = + filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); + } + } + } + + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + } + + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return may_match; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) { + is_out_of_bound_ = false; + if (!CheckPrefixMayMatch(target)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + + InitDataBlock(); + + block_iter_.Seek(target); + + FindKeyForward(); + assert( + !block_iter_.Valid() || + (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || + (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), + block_iter_.key()) <= 0)); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev( + const Slice& target) { + is_out_of_bound_ = false; + if (!CheckPrefixMayMatch(target)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely to contain the position for `target`, the + // same as Seek(), rather than than before. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is difference is when they seek to a position + // in the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. In this case, we'll + // end up with reading two blocks. 
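As a standalone illustration of the positioning described above (made-up integer keys, not code from this file): the index stores each data block's last key, so a lower-bound search sends Seek(7) and SeekForPrev(7) to the same block, and only the boundary case SeekForPrev(5) would ideally land one block earlier.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Index entries: the last key of each data block, as in the comment above:
  // blocks [2,4] [6,8] [10,12]  ->  index keys {4, 8, 12}.
  const std::vector<int> index_keys = {4, 8, 12};

  auto seek_block = [&](int target) {
    // Index Seek(target): first block whose last key is >= target.
    auto it = std::lower_bound(index_keys.begin(), index_keys.end(), target);
    return static_cast<int>(it - index_keys.begin());  // 0-based block number
  };

  std::printf("Seek(7)        -> block %d\n", seek_block(7));  // block 1: [6,8]
  std::printf("SeekForPrev(7) -> block %d\n", seek_block(7));  // same block 1
  // Boundary case: the ideal answer for SeekForPrev(5) is block 0 ([2,4]),
  // but the index alone sends us to block 1; the table then has to step back,
  // which is the extra block read mentioned above.
  std::printf("SeekForPrev(5) -> block %d (then step back to block 0)\n",
              seek_block(5));
  return 0;
}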
+ index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + block_iter_points_to_real_block_ = false; + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() { + is_out_of_bound_ = false; + SavePrevIndexValue(); + index_iter_->SeekToFirst(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToFirst(); + FindKeyForward(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() { + is_out_of_bound_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Next() { + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Prev() { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + FindKeyBackward(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_index_value_.offset() || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Automatically prefetch additional data when a range scan (iterator) does + // more than 2 sequential IOs. This is enabled only for user reads and when + // ReadOptions.readahead_size is 0. + if (!for_compaction_ && read_options_.readahead_size == 0) { + num_file_reads_++; + if (num_file_reads_ > 2) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast<size_t>(data_block_handle.size()) + + kBlockTrailerSize > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as we + // can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = + static_cast<size_t>(data_block_handle.offset() + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxReadaheadSize. + readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. 
+          prefetch_buffer_.reset(new FilePrefetchBuffer(
+              rep->file.get(), kInitReadaheadSize, kMaxReadaheadSize));
+        }
+      }
+    }
+
+    Status s;
+    BlockBasedTable::NewDataBlockIterator<TBlockIter>(
+        rep, read_options_, data_block_handle, &block_iter_, is_index_,
+        key_includes_seq_, index_key_is_full_,
+        /* get_context */ nullptr, s, prefetch_buffer_.get());
+    block_iter_points_to_real_block_ = true;
+  }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
+  assert(!is_out_of_bound_);
+  // TODO the while loop inherits from two-level-iterator. We don't know
+  // whether a block can be empty so it can be replaced by an "if".
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+    ResetDataIter();
+    // We used to check the current index key for upperbound.
+    // It will only save a data read for a small percentage of use cases,
+    // so for code simplicity, we removed it. We can add it back if there is a
+    // significant performance regression.
+    index_iter_->Next();
+
+    if (index_iter_->Valid()) {
+      InitDataBlock();
+      block_iter_.SeekToFirst();
+    } else {
+      return;
+    }
+  }
+
+  // Check upper bound on the current key
+  bool reached_upper_bound =
+      (read_options_.iterate_upper_bound != nullptr &&
+       block_iter_points_to_real_block_ && block_iter_.Valid() &&
+       user_comparator_.Compare(ExtractUserKey(block_iter_.key()),
+                                *read_options_.iterate_upper_bound) >= 0);
+  TEST_SYNC_POINT_CALLBACK(
+      "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound",
+      &reached_upper_bound);
+  if (reached_upper_bound) {
+    is_out_of_bound_ = true;
+    return;
+  }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
+  assert(!is_out_of_bound_);
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    ResetDataIter();
+    index_iter_->Prev();
+
+    if (index_iter_->Valid()) {
+      InitDataBlock();
+      block_iter_.SeekToLast();
+    } else {
+      return;
+    }
+  }
+
+  // We could have checked the lower bound here too, but we opt not to do it
+  // for code simplicity.
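Stepping back to the readahead logic in InitDataBlock above: buffered I/O starts prefetching only after more than two sequential block reads and then doubles the window on each trigger, saturating at kMaxReadaheadSize (256 KB for this iterator). A standalone sketch of that schedule; the initial window below is an assumed example value, since kInitReadaheadSize's definition is not part of this hunk:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t kMaxReadaheadSize = 256 * 1024;  // as defined for the iterator above
  size_t readahead_size = 8 * 1024;             // assumed initial value for illustration
  size_t num_file_reads = 0;

  for (int block = 0; block < 8; ++block) {
    ++num_file_reads;
    if (num_file_reads <= 2) {
      continue;  // first two sequential reads: no readahead yet
    }
    std::printf("read #%zu: prefetch %zu bytes\n", num_file_reads, readahead_size);
    // Double the window each time, capped at kMaxReadaheadSize.
    readahead_size = std::min(kMaxReadaheadSize, readahead_size * 2);
  }
  return 0;
}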
+} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, bool for_compaction) { + bool need_upper_bound_check = + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + const bool kIsNotIndex = false; + if (arena == nullptr) { + return new BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>)); + return new (mem) BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator(read_options, need_upper_bound_check), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + if (filter->whole_key_filtering()) { + may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid, + no_io, const_ikey_ptr); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, false, + const_ikey_ptr)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be internal key + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + CachableEntry<FilterBlockReader> filter_entry; + if (!skip_filters) { + filter_entry = + GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, + read_options.read_tier == kBlockCacheTier, get_context); + } + FilterBlockReader* filter = filter_entry.value; + + // First check the full filter + // If full filter not useful, Then go into each block + if (!FullFilterKeyMayMatch(read_options, filter, key, no_io, + 
prefix_extractor)) {
+    RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+  } else {
+    IndexBlockIter iiter_on_stack;
+    // if prefix_extractor found in block differs from options, disable
+    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+    bool need_upper_bound_check = false;
+    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+      need_upper_bound_check = PrefixExtractorChanged(
+          rep_->table_properties.get(), prefix_extractor);
+    }
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         /* index_entry */ nullptr, get_context);
+    std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    bool matched = false;  // if such user key matched a key in SST
+    bool done = false;
+    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+      BlockHandle handle = iiter->value();
+
+      bool not_exist_in_filter =
+          filter != nullptr && filter->IsBlockBased() == true &&
+          !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor,
+                               handle.offset(), no_io);
+
+      if (not_exist_in_filter) {
+        // Not found
+        // TODO: think about interaction with Merge. If a user key cannot
+        // cross one data block, we should be fine.
+        RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+        break;
+      } else {
+        DataBlockIter biter;
+        NewDataBlockIterator<DataBlockIter>(
+            rep_, read_options, iiter->value(), &biter, false,
+            true /* key_includes_seq */, true /* index_key_is_full */,
+            get_context);
+
+        if (read_options.read_tier == kBlockCacheTier &&
+            biter.status().IsIncomplete()) {
+          // couldn't get block from block_cache
+          // Update Saver.state to Found because we are only looking for
+          // whether we can guarantee the key is not there when "no_io" is set
+          get_context->MarkKeyMayExist();
+          break;
+        }
+        if (!biter.status().ok()) {
+          s = biter.status();
+          break;
+        }
+
+        bool may_exist = biter.SeekForGet(key);
+        if (!may_exist) {
+          // HashSeek cannot find the key in this block and the iter is not
+          // at the end of the block, i.e. the key cannot be in the following
+          // blocks either. In this case, the seek_key cannot be found, so we
+          // break from the top level for-loop.
+          break;
+        }
+
+        // Call the *saver function on each entry/block until it returns false
+        for (; biter.Valid(); biter.Next()) {
+          ParsedInternalKey parsed_key;
+          if (!ParseInternalKey(biter.key(), &parsed_key)) {
+            s = Status::Corruption(Slice());
+          }
+
+          if (!get_context->SaveValue(
+                  parsed_key, biter.value(), &matched,
+                  biter.IsValuePinned() ?
&biter : nullptr)) { + done = true; + break; + } + } + s = biter.status(); + } + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + } + + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return s; +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + auto user_comparator = comparator.user_comparator(); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = + std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value(); + const bool is_user_key = rep_->table_properties && + rep_->table_properties->index_key_is_user_key > 0; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. 
+ // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + NewDataBlockIterator<DataBlockIter>(rep_, ReadOptions(), block_handle, + &biter); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum() { + Status s; + // Check Meta blocks + std::unique_ptr<Block> meta; + std::unique_ptr<InternalIterator> meta_iter; + s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(meta_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + InternalIteratorBase<BlockHandle>* iiter = + NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = + std::unique_ptr<InternalIteratorBase<BlockHandle>>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase<BlockHandle>* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle = index_iter->value(); + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + InternalIteratorBase<Slice>* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { + TableProperties* table_properties; + s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, + index_iter->value(), + &table_properties); + delete table_properties; + } + if (!s.ok()) { + break; + } + } + return s; +} + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter( + NewIndexIterator(options)); + iiter->Seek(key); + assert(iiter->Valid()); + CachableEntry<Block> block; + + BlockHandle handle = iiter->value(); + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(block_cache != nullptr); + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); + Slice ckey; + + 
Status s; + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr<const BlockContents> compression_dict_block; + s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + &compression_dict_block); + if (s.ok()) { + assert(compression_dict_block != nullptr); + UncompressionDict uncompression_dict( + compression_dict_block->data.ToString(), + rep_->blocks_definitely_zstd_compressed); + s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, + options, &block, uncompression_dict, + 0 /* read_amp_bytes_per_bit */); + } + } else { + s = GetDataBlockFromCache( + cache_key, ckey, block_cache, nullptr, rep_, options, &block, + UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); + } + assert(s.ok()); + bool in_cache = block.value != nullptr; + if (in_cache) { + ReleaseCachedEntry(block_cache, block.cache_handle); + } + return in_cache; +} + +BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { + // Some old version of block-based tables don't have index type present in + // table properties. If that's the case we can safely use the kBinarySearch. + BlockBasedTableOptions::IndexType index_type_on_file = + BlockBasedTableOptions::kBinarySearch; + if (rep_->table_properties) { + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>( + DecodeFixed32(pos->second.c_str())); + // update index_type with the true type + rep_->index_type = index_type_on_file; + } + } + return index_type_on_file; +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, + InternalIterator* preloaded_meta_index_iter, int level) { + auto index_type_on_file = UpdateIndexType(); + + auto file = rep_->file.get(); + const InternalKeyComparator* icomparator = &rep_->internal_comparator; + const Footer& footer = rep_->footer; + + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. 
+ // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true + + switch (index_type_on_file) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create( + this, file, prefetch_buffer, footer, footer.index_handle(), + rep_->ioptions, icomparator, index_reader, + rep_->persistent_cache_options, level, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); + } + case BlockBasedTableOptions::kBinarySearch: { + return BinarySearchIndexReader::Create( + file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, + icomparator, index_reader, rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); + } + case BlockBasedTableOptions::kHashSearch: { + std::unique_ptr<Block> meta_guard; + std::unique_ptr<InternalIterator> meta_iter_guard; + auto meta_index_iter = preloaded_meta_index_iter; + if (meta_index_iter == nullptr) { + auto s = + ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); + if (!s.ok()) { + // we simply fall back to binary search in case there is any + // problem with prefix hash index loading. + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Unable to read the metaindex block." + " Fall back to binary search index."); + return BinarySearchIndexReader::Create( + file, prefetch_buffer, footer, footer.index_handle(), + rep_->ioptions, icomparator, index_reader, + rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); + } + meta_index_iter = meta_iter_guard.get(); + } + + return HashIndexReader::Create( + rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, + rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, + index_reader, rep_->hash_index_allow_collision, + rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); + } + default: { + std::string error_message = + "Unrecognized index type: " + ToString(index_type_on_file); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { + std::unique_ptr<InternalIteratorBase<BlockHandle>> index_iter( + NewIndexIterator(ReadOptions())); + + index_iter->Seek(key); + uint64_t result; + if (index_iter->Valid()) { + BlockHandle handle = index_iter->value(); + result = handle.offset(); + } else { + // key is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + result = 0; + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. 
+ if (result == 0) { + result = rep_->footer.metaindex_handle().offset(); + } + } + return result; +} + +bool BlockBasedTable::TEST_filter_block_preloaded() const { + return rep_->filter != nullptr; +} + +bool BlockBasedTable::TEST_index_reader_preloaded() const { + return rep_->index_reader != nullptr; +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector<KVPairBlock>* kv_pair_blocks) { + std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter( + NewIndexIterator(ReadOptions())); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + rep_, ReadOptions(), blockhandles_iter->value())); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpTable(WritableFile* out_file, + const SliceTransform* prefix_extractor) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr<Block> meta; + std::unique_ptr<InternalIterator> meta_iter; + Status s = + ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); + if (s.ok()) { + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + s = meta_iter->status(); + if (!s.ok()) { + return s; + } + if (meta_iter->key() == rocksdb::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(meta_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (meta_iter->key() == rocksdb::kRangeDelBlock) { + out_file->Append(" Range deletion block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const rocksdb::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + 
"--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as off now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if (table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, + ReadOptions(), handle, &block, rep_->ioptions, + false /*decompress*/, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + rep_->filter.reset(new BlockBasedFilterBlockReader( + prefix_extractor, table_options, + table_options.whole_key_filtering, std::move(block), + rep_->ioptions.statistics)); + } + } + } + } + } + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr<const BlockContents> compression_dict_block; + s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + &compression_dict_block); + if (!s.ok()) { + return s; + } + assert(compression_dict_block != nullptr); + auto compression_dict = compression_dict_block->data; + out_file->Append( + "Compression Dictionary:\n" + "--------------------------------------\n"); + out_file->Append(" size (bytes): "); + out_file->Append(rocksdb::ToString(compression_dict.size())); + out_file->Append("\n\n"); + out_file->Append(" HEX "); + out_file->Append(compression_dict.ToString(true).c_str()); + out_file->Append("\n\n"); + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_file->Append( + "Range deletions:\n" + "--------------------------------------\n" + " "); + for (; range_del_iter->Valid(); range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + } + out_file->Append("\n"); + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +void BlockBasedTable::Close() { + if (rep_->closed) { + return; + } + + Cache* const cache = rep_->table_options.block_cache.get(); + + rep_->filter_entry.Release(cache); + rep_->index_entry.Release(cache); + + // cleanup index, filter, and compression dictionary blocks + // to avoid accessing dangling pointers + if (!rep_->table_options.no_block_cache) { + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + + // Get the filter block key + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->filter_handle, cache_key); + cache->Erase(key); + + // Get the index block key + key = GetCacheKeyFromOffset(rep_->cache_key_prefix, + 
rep_->cache_key_prefix_size, + rep_->dummy_index_reader_offset, cache_key); + cache->Erase(key); + + if (!rep_->compression_dict_handle.IsNull()) { + // Get the compression dictionary block key + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key); + cache->Erase(key); + } + } + + rep_->closed = true; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + "--------------------------------------\n"); + std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter( + NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (rep_->table_properties && + rep_->table_properties->index_key_is_user_key != 0) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_file->Append(" HEX "); + out_file->Append(user_key.ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter( + NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value(); + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_file->Append("Data Block # "); + out_file->Append(rocksdb::ToString(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + rep_, ReadOptions(), blockhandles_iter->value())); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + 
break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + } + out_file->Append("\n"); + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg = + static_cast<double>(datablock_size_sum) / num_datablocks; + out_file->Append("Data Block Summary:\n"); + out_file->Append("--------------------------------------"); + out_file->Append("\n # data blocks: "); + out_file->Append(rocksdb::ToString(num_datablocks)); + out_file->Append("\n min data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_min)); + out_file->Append("\n max data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_max)); + out_file->Append("\n avg data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_avg)); + out_file->Append("\n"); + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(value.ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); +} + +namespace { + +void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { + FilterBlockReader* filter = reinterpret_cast<FilterBlockReader*>(value); + if (filter->statistics() != nullptr) { + RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, + filter->ApproximateMemoryUsage()); + } + delete filter; +} + +void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { + IndexReader* index_reader = reinterpret_cast<IndexReader*>(value); + if (index_reader->statistics() != nullptr) { + RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, + index_reader->ApproximateMemoryUsage()); + } + delete index_reader; +} + +void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { + UncompressionDict* dict = reinterpret_cast<UncompressionDict*>(value); + RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + dict->ApproximateMemoryUsage()); + delete dict; +} + +} // anonymous namespace + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_based_table_reader.h b/src/rocksdb/table/block_based_table_reader.h new file mode 100644 index 00000000..f0b5cdb1 --- /dev/null +++ b/src/rocksdb/table/block_based_table_reader.h @@ -0,0 +1,710 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <stdint.h> +#include <memory> +#include <set> +#include <string> +#include <utility> +#include <vector> + +#include "db/range_tombstone_fragmenter.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/persistent_cache_helper.h" +#include "table/table_properties_internal.h" +#include "table/table_reader.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/file_reader_writer.h" +#include "util/user_comparator_wrapper.h" + +namespace rocksdb { + +class BlockHandle; +class Cache; +class FilterBlockReader; +class BlockBasedFilterBlockReader; +class FullFilterBlockReader; +class Footer; +class InternalKeyComparator; +class Iterator; +class RandomAccessFile; +class TableCache; +class TableReader; +class WritableFile; +struct BlockBasedTableOptions; +struct EnvOptions; +struct ReadOptions; +class GetContext; + +typedef std::vector<std::pair<std::string, std::string>> KVPairBlock; + +// A Table is a sorted map from strings to strings. Tables are +// immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. +class BlockBasedTable : public TableReader { + public: + static const std::string kFilterBlockPrefix; + static const std::string kFullFilterBlockPrefix; + static const std::string kPartitionedFilterBlockPrefix; + // The longest prefix of the cache key used to identify blocks. + // For Posix files the unique ID is three varints. + static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1; + + // Attempt to open the table that is stored in bytes [0..file_size) + // of "file", and read the metadata entries necessary to allow + // retrieving data from the table. + // + // If successful, returns ok and sets "*table_reader" to the newly opened + // table. The client should delete "*table_reader" when no longer needed. + // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // @param file must remain live while this Table is in use. + // @param prefetch_index_and_filter_in_cache can be used to disable + // prefetching of + // index and filter blocks into block cache at startup + // @param skip_filters Disables loading/accessing the filter block. Overrides + // prefetch_index_and_filter_in_cache, so filter will be skipped if both + // are set. 
+ static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, + bool skip_filters = false, int level = -1, + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr); + + bool PrefixMayMatch(const Slice& internal_key, + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check); + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // @param skip_filters Disables loading/accessing the filter block + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; + + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& read_options) override; + + // @param skip_filters Disables loading/accessing the filter block + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + // Pre-fetch the disk blocks that correspond to the key range specified by + // (kbegin, kend). The call will return error status in the event of + // IO or iteration error. + Status Prefetch(const Slice* begin, const Slice* end) override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key) override; + + // Returns true if the block for the specified key is in cache. + // REQUIRES: key is in this table && block cache enabled + bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); + + // Set up the table for Compaction. Might change some parameters with + // posix_fadvise + void SetupForCompaction() override; + + std::shared_ptr<const TableProperties> GetTableProperties() const override; + + size_t ApproximateMemoryUsage() const override; + + // convert SST file to a human readable form + Status DumpTable(WritableFile* out_file, + const SliceTransform* prefix_extractor = nullptr) override; + + Status VerifyChecksum() override; + + void Close() override; + + ~BlockBasedTable(); + + bool TEST_filter_block_preloaded() const; + bool TEST_index_reader_preloaded() const; + + // IndexReader is the interface that provide the functionality for index + // access. + class IndexReader { + public: + explicit IndexReader(const InternalKeyComparator* icomparator, + Statistics* stats) + : icomparator_(icomparator), statistics_(stats) {} + + virtual ~IndexReader() {} + + // Create an iterator for index access. + // If iter is null then a new object is created on heap and the callee will + // have the ownership. 
If a non-null iter is passed in it will be used, and + // the returned value is either the same as iter or a new on-heap object + // that + // wrapps the passed iter. In the latter case the return value would point + // to + // a different object then iter and the callee has the ownership of the + // returned object. + virtual InternalIteratorBase<BlockHandle>* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; + + // The size of the index. + virtual size_t size() const = 0; + // Memory usage of the index block + virtual size_t usable_size() const = 0; + // return the statistics pointer + virtual Statistics* statistics() const { return statistics_; } + // Report an approximation of how much memory has been used other than + // memory + // that was allocated in block cache. + virtual size_t ApproximateMemoryUsage() const = 0; + + virtual void CacheDependencies(bool /* unused */) {} + + // Prefetch all the blocks referenced by this index to the buffer + void PrefetchBlocks(FilePrefetchBuffer* buf); + + protected: + const InternalKeyComparator* icomparator_; + + private: + Statistics* statistics_; + }; + + static Slice GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key); + + // Retrieve all key value pairs from data blocks in the table. + // The key retrieved are internal keys. + Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); + + template <class TValue> + struct CachableEntry; + struct Rep; + + Rep* get_rep() { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + static TBlockIter* NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const Slice& index_value, + TBlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr); + template <typename TBlockIter> + static TBlockIter* NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, + TBlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); + + class PartitionedIndexIteratorState; + + friend class PartitionIndexReader; + + protected: + Rep* rep_; + explicit BlockBasedTable(Rep* rep) : rep_(rep) {} + + private: + friend class MockedBlockBasedTable; + static std::atomic<uint64_t> next_cache_key_id_; + + // If block cache enabled (compressed or uncompressed), looks for the block + // identified by handle in (1) uncompressed cache, (2) compressed cache, and + // then (3) file. If found, inserts into the cache(s) that were searched + // unsuccessfully (e.g., if found in file, will add to both uncompressed and + // compressed caches if they're enabled). + // + // @param block_entry value is set to the uncompressed block if found. If + // in uncompressed block cache, also sets cache_handle to reference that + // block. 
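The lookup order documented here can be condensed into a toy sketch (simplified map-based caches and stub file/decompression helpers; the real method also handles dictionaries, statistics, priorities and failure paths):

#include <map>
#include <string>

// Toy two-tier lookup mirroring the documented order:
// (1) uncompressed cache, (2) compressed cache, (3) file,
// back-filling whichever tiers missed.
using ToyCache = std::map<std::string, std::string>;

static std::string Uncompress(const std::string& raw) { return "u:" + raw; }
static std::string ReadFromFile(const std::string& key) { return "raw-" + key; }

std::string ReadBlock(ToyCache& uncompressed_cache, ToyCache& compressed_cache,
                      const std::string& key) {
  if (auto it = uncompressed_cache.find(key); it != uncompressed_cache.end()) {
    return it->second;                       // (1) hit the uncompressed tier
  }
  std::string raw;
  if (auto it = compressed_cache.find(key); it != compressed_cache.end()) {
    raw = it->second;                        // (2) hit the compressed tier
  } else {
    raw = ReadFromFile(key);                 // (3) go to the file
    compressed_cache.emplace(key, raw);      // back-fill the compressed tier
  }
  std::string block = Uncompress(raw);
  uncompressed_cache.emplace(key, block);    // back-fill the uncompressed tier
  return block;
}

int main() {
  ToyCache block_cache, block_cache_compressed;
  ReadBlock(block_cache, block_cache_compressed, "block@8192");  // miss, miss, file
  ReadBlock(block_cache, block_cache_compressed, "block@8192");  // tier-1 hit
  return 0;
}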
+ static Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, bool is_index = false, + GetContext* get_context = nullptr); + + // For the following two functions: + // if `no_io == true`, we will not try to read filter/index from sst file + // if they are not already present in cache. + CachableEntry<FilterBlockReader> GetFilter( + const SliceTransform* prefix_extractor = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, + GetContext* get_context = nullptr) const; + virtual CachableEntry<FilterBlockReader> GetFilter( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, + const bool is_a_filter_partition, bool no_io, GetContext* get_context, + const SliceTransform* prefix_extractor = nullptr) const; + + static CachableEntry<UncompressionDict> GetUncompressionDict( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context); + + // Get the iterator from the index reader. + // If input_iter is not set, return a new Iterator. + // If input_iter is set, update it and return it as Iterator. + // + // Note: ErrorIterator with Status::Incomplete shall be returned if all the + // following conditions are met: + // 1. We enabled table_options.cache_index_and_filter_blocks. + // 2. index is not present in block cache. + // 3. We disallowed any io to be performed, that is, read_options == + // kBlockCacheTier + InternalIteratorBase<BlockHandle>* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check = false, + IndexBlockIter* input_iter = nullptr, + CachableEntry<IndexReader>* index_entry = nullptr, + GetContext* get_context = nullptr); + + // Look up the block in the block caches (if set): block_cache and + // block_cache_compressed. + // On success, Status::OK will be returned and @block will be populated with + // a pointer to the block as well as its block handle. + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + static Status GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry<Block>* block, + const UncompressionDict& uncompression_dict, + size_t read_amp_bytes_per_bit, bool is_index = false, + GetContext* get_context = nullptr); + + // Put a raw block (maybe compressed) into the corresponding block caches. + // This method will perform decompression against raw_block if needed and then + // populate the block caches. + // On success, Status::OK will be returned; also @block will be populated with + // uncompressed block and its cache handle. + // + // Allocated memory managed by raw_block_contents will be transferred to + // PutDataBlockToCache(). After the call, the object will be invalid. + // @param uncompression_dict Data for presetting the compression library's + // dictionary.
+ static Status PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, const ImmutableCFOptions& ioptions, + CachableEntry<Block>* block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, + GetContext* get_context = nullptr); + + // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found + // after a call to Seek(key), until handle_result returns false. + // May not make such a call if filter policy says that key is not present. + friend class TableCache; + friend class BlockBasedTableBuilder; + + void ReadMeta(const Footer& footer); + + // Figure the index type, update it in rep_, and also return it. + BlockBasedTableOptions::IndexType UpdateIndexType(); + + // Create a index reader based on the index type stored in the table. + // Optionally, user can pass a preloaded meta_index_iter for the index that + // need to access extra meta blocks for index construction. This parameter + // helps avoid re-reading meta index block if caller already created one. + Status CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, + InternalIterator* preloaded_meta_index_iter = nullptr, + const int level = -1); + + bool FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& user_key, const bool no_io, + const SliceTransform* prefix_extractor = nullptr) const; + + static Status PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer); + static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* meta_block, + std::unique_ptr<InternalIterator>* iter); + static Status TryReadPropertiesWithGlobalSeqno( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties); + static Status ReadPropertiesBlock(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + static Status ReadRangeDelBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator); + static Status ReadCompressionDictBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<const BlockContents>* compression_dict_block); + static Status PrefetchIndexAndFilterBlocks( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + const SliceTransform* prefix_extractor, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + const bool prefetch_index_and_filter_in_cache); + + Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase<BlockHandle>* index_iter); + + // Create the filter from the filter block. 
+ virtual FilterBlockReader* ReadFilter( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, + const bool is_a_filter_partition, + const SliceTransform* prefix_extractor = nullptr) const; + + static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); + + // Generate a cache key prefix from the file + static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size); + static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, + size_t* size); + + // Helper functions for DumpTable() + Status DumpIndexBlock(WritableFile* out_file); + Status DumpDataBlocks(WritableFile* out_file); + void DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file); + + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; + + friend class PartitionedFilterBlockReader; + friend class PartitionedFilterBlockTest; +}; + +// Maitaning state of a two-level iteration on a partitioned index structure +class BlockBasedTable::PartitionedIndexIteratorState + : public TwoLevelIteratorState { + public: + PartitionedIndexIteratorState( + BlockBasedTable* table, + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase<BlockHandle>* NewSecondaryIterator( + const BlockHandle& index_value) override; + + private: + // Don't own table_ + BlockBasedTable* table_; + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_; + bool index_key_includes_seq_; + bool index_key_is_full_; +}; + +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. +// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. +template <class TValue> +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* _value, Cache::Handle* _cache_handle) + : value(_value), cache_handle(_cache_handle) {} + CachableEntry() : CachableEntry(nullptr, nullptr) {} + void Release(Cache* cache, bool force_erase = false) { + if (cache_handle) { + cache->Release(cache_handle, force_erase); + value = nullptr; + cache_handle = nullptr; + } + } + bool IsSet() const { return cache_handle != nullptr; } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. + Cache::Handle* cache_handle = nullptr; +}; + +struct BlockBasedTable::Rep { + Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + const BlockBasedTableOptions& _table_opt, + const InternalKeyComparator& _internal_comparator, bool skip_filters, + int _level, const bool _immortal_table) + : ioptions(_ioptions), + env_options(_env_options), + table_options(_table_opt), + filter_policy(skip_filters ? 
nullptr : _table_opt.filter_policy.get()), + internal_comparator(_internal_comparator), + filter_type(FilterType::kNoFilter), + index_type(BlockBasedTableOptions::IndexType::kBinarySearch), + hash_index_allow_collision(false), + whole_key_filtering(_table_opt.whole_key_filtering), + prefix_filtering(true), + global_seqno(kDisableGlobalSequenceNumber), + level(_level), + immortal_table(_immortal_table) {} + + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; + const BlockBasedTableOptions table_options; + const FilterPolicy* const filter_policy; + const InternalKeyComparator& internal_comparator; + Status status; + std::unique_ptr<RandomAccessFileReader> file; + char cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size = 0; + char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t persistent_cache_key_prefix_size = 0; + char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size = 0; + uint64_t dummy_index_reader_offset = + 0; // ID that is unique for the block cache. + PersistentCacheOptions persistent_cache_options; + + // Footer contains the fixed table information + Footer footer; + // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e., + // non-nullptr) and used only when options.block_cache is nullptr or when + // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index, + // filter, and compression dictionary blocks via the block cache. In that case + // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle` + // are used to lookup these meta-blocks in block cache. + std::unique_ptr<IndexReader> index_reader; + std::unique_ptr<FilterBlockReader> filter; + std::unique_ptr<UncompressionDict> uncompression_dict; + + enum class FilterType { + kNoFilter, + kFullFilter, + kBlockFilter, + kPartitionedFilter, + }; + FilterType filter_type; + BlockHandle filter_handle; + BlockHandle compression_dict_handle; + + std::shared_ptr<const TableProperties> table_properties; + BlockBasedTableOptions::IndexType index_type; + bool hash_index_allow_collision; + bool whole_key_filtering; + bool prefix_filtering; + // TODO(kailiu) It is very ugly to use internal key in table, since table + // module should not be relying on db module. However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. + std::unique_ptr<SliceTransform> internal_prefix_transform; + std::shared_ptr<const SliceTransform> table_prefix_extractor; + + // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is + // true or in all levels when pin_top_level_index_and_filter is set in + // combination with partitioned index/filters: then we do use the LRU cache, + // but we always keep the filter & index block's handle checked out here (=we + // don't call Release()), plus the parsed out objects the LRU cache will never + // push flush them out, hence they're pinned + CachableEntry<FilterBlockReader> filter_entry; + CachableEntry<IndexReader> index_entry; + std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels; + + // If global_seqno is used, all Keys in this file will have the same + // seqno with value `global_seqno`. + // + // A value of kDisableGlobalSequenceNumber means that this feature is disabled + // and every key have it's own seqno. 
+ SequenceNumber global_seqno; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + bool blocks_definitely_zstd_compressed = false; + + bool closed = false; + const bool immortal_table; + + SequenceNumber get_global_seqno(bool is_index) const { + return is_index ? kDisableGlobalSequenceNumber : global_seqno; + } +}; + +template <class TBlockIter, typename TValue = Slice> +class BlockBasedTableIterator : public InternalIteratorBase<TValue> { + public: + BlockBasedTableIterator(BlockBasedTable* table, + const ReadOptions& read_options, + const InternalKeyComparator& icomp, + InternalIteratorBase<BlockHandle>* index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, bool is_index, + bool key_includes_seq = true, + bool index_key_is_full = true, + bool for_compaction = false) + : table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + index_iter_(index_iter), + pinned_iters_mgr_(nullptr), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), + is_index_(is_index), + key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), + for_compaction_(for_compaction) {} + + ~BlockBasedTableIterator() { delete index_iter_; } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && block_iter_points_to_real_block_ && + block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + TValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + if (!index_iter_->status().ok()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + bool IsOutOfBound() override { return is_out_of_bound_; } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + } + bool IsValuePinned() const override { + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + bool CheckPrefixMayMatch(const Slice& ikey) { + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_)) { + // TODO remember the iterator is invalidated because of prefix + // match. 
This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_index_value_ = index_iter_->value(); + } + } + + void InitDataBlock(); + void FindKeyForward(); + void FindKeyBackward(); + + private: + BlockBasedTable* table_; + const ReadOptions read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + InternalIteratorBase<BlockHandle>* index_iter_; + PinnedIteratorsManager* pinned_iters_mgr_; + TBlockIter block_iter_; + bool block_iter_points_to_real_block_; + bool is_out_of_bound_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + const SliceTransform* prefix_extractor_; + // If the blocks over which we iterate are index blocks + bool is_index_; + // If the keys in the blocks over which we iterate include 8 byte sequence + bool key_includes_seq_; + bool index_key_is_full_; + // If this iterator is created for compaction + bool for_compaction_; + BlockHandle prev_index_value_; + + static const size_t kInitReadaheadSize = 8 * 1024; + // Found that 256 KB readahead size provides the best performance, based on + // experiments. + static const size_t kMaxReadaheadSize; + size_t readahead_size_ = kInitReadaheadSize; + size_t readahead_limit_ = 0; + int num_file_reads_ = 0; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_builder.cc b/src/rocksdb/table/block_builder.cc new file mode 100644 index 00000000..c14b4f6d --- /dev/null +++ b/src/rocksdb/table/block_builder.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. 
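The entry layout just described is simple enough to sketch outside of BlockBuilder. The helper below is illustrative only: AppendEntry and its local varint writer are stand-ins that are not part of this file, and the sketch merely assumes the standard LevelDB/RocksDB varint32 wire format and the restart-point semantics described above.

#include <algorithm>
#include <cstdint>
#include <string>

namespace {
// Minimal varint32 writer, mirroring the wire format assumed above.
void PutVarint32Local(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Appends one <shared><unshared><value_length><key_delta><value> entry.
// At a restart point, shared is forced to 0 and the full key is stored.
void AppendEntry(std::string* block, const std::string& prev_key,
                 const std::string& key, const std::string& value,
                 bool restart_point) {
  size_t shared = 0;
  if (!restart_point) {
    const size_t limit = std::min(prev_key.size(), key.size());
    while (shared < limit && prev_key[shared] == key[shared]) {
      ++shared;
    }
  }
  PutVarint32Local(block, static_cast<uint32_t>(shared));
  PutVarint32Local(block, static_cast<uint32_t>(key.size() - shared));
  PutVarint32Local(block, static_cast<uint32_t>(value.size()));
  block->append(key.data() + shared, key.size() - shared);  // key_delta
  block->append(value);                                      // value, stored as-is
}
}  // namespace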
+// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. + +#include "table/block_builder.h" + +#include <assert.h> +#include <algorithm> +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "table/data_block_footer.h" +#include "util/coding.h" + +namespace rocksdb { + +BlockBuilder::BlockBuilder( + int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding, + BlockBasedTableOptions::DataBlockIndexType index_type, + double data_block_hash_table_util_ratio) + : block_restart_interval_(block_restart_interval), + use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + restarts_(), + counter_(0), + finished_(false) { + switch (index_type) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + data_block_hash_index_builder_.Initialize( + data_block_hash_table_util_ratio); + break; + default: + assert(0); + } + assert(block_restart_interval_ >= 1); + restarts_.push_back(0); // First restart point is at offset 0 + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); +} + +void BlockBuilder::Reset() { + buffer_.clear(); + restarts_.clear(); + restarts_.push_back(0); // First restart point is at offset 0 + estimate_ = sizeof(uint32_t) + sizeof(uint32_t); + counter_ = 0; + finished_ = false; + last_key_.clear(); + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Reset(); + } +} + +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, + const Slice& value) const { + size_t estimate = CurrentSizeEstimate(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. + estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + + if (counter_ >= block_restart_interval_) { + estimate += sizeof(uint32_t); // a new restart entry. + } + + estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. + estimate += VarintLength(key.size()); // varint for key length. + if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. 
+ } + + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + + uint32_t num_restarts = static_cast<uint32_t>(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); + size_t shared = 0; // number of bytes shared with prev key + if (counter_ >= block_restart_interval_) { + // Restart compression + restarts_.push_back(static_cast<uint32_t>(buffer_.size())); + estimate_ += sizeof(uint32_t); + counter_ = 0; + + if (use_delta_encoding_) { + // Update state + last_key_.assign(key.data(), key.size()); + } + } else if (use_delta_encoding_) { + Slice last_key_piece(last_key_); + // See how much sharing to do with previous string + shared = key.difference_offset(last_key_piece); + + // Update state + // We used to just copy the changed data here, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } + + const size_t non_shared = key.size() - shared; + const size_t curr_size = buffer_.size(); + + if (use_value_delta_encoding_) { + // Add "<shared><non_shared>" to buffer_ + PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared)); + } else { + // Add "<shared><non_shared><value_size>" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared), + static_cast<uint32_t>(value.size())); + } + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } + + counter_++; + estimate_ += buffer_.size() - curr_size; +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_builder.h b/src/rocksdb/table/block_builder.h new file mode 100644 index 00000000..0576279f --- /dev/null +++ b/src/rocksdb/table/block_builder.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <vector> + +#include <stdint.h> +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/data_block_hash_index.h" + +namespace rocksdb { + +class BlockBuilder { + public: + BlockBuilder(const BlockBuilder&) = delete; + void operator=(const BlockBuilder&) = delete; + + explicit BlockBuilder(int block_restart_interval, + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { return buffer_.empty(); } + + private: + const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder + const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; + + std::string buffer_; // Destination buffer + std::vector<uint32_t> restarts_; // Restart points + size_t estimate_; + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_fetcher.cc b/src/rocksdb/table/block_fetcher.cc new file mode 100644 index 00000000..1f209210 --- /dev/null +++ b/src/rocksdb/table/block_fetcher.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
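Before the block fetcher below, a short usage sketch of the BlockBuilder interface declared above may help. It is illustrative only: it assumes the rocksdb headers shown in this diff are on the include path and that the input keys are already sorted, as Add() requires; BuildOneBlock is a hypothetical helper, not part of the library.

#include <string>
#include <utility>
#include <vector>

#include "rocksdb/slice.h"
#include "table/block_builder.h"

// Builds a single data block from pre-sorted key/value pairs and returns a
// copy of the serialized block contents.
std::string BuildOneBlock(
    const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
  rocksdb::BlockBuilder builder(16 /* block_restart_interval */);
  for (const auto& kv : sorted_kvs) {
    builder.Add(kv.first, kv.second);  // keys must arrive in ascending order
  }
  rocksdb::Slice raw = builder.Finish();  // valid until Reset() or destruction
  return raw.ToString();                  // copy out before the builder goes away
}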
+ +#include "table/block_fetcher.h" + +#include <inttypes.h> +#include <string> + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/format.h" +#include "table/persistent_cache_helper.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/logging.h" +#include "util/memory_allocator.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace rocksdb { + +inline void BlockFetcher::CheckBlockChecksum() { + // Check the crc of the type and the block contents + if (read_options_.verify_checksums) { + const char* data = slice_.data(); // Pointer to where Read put the data + PERF_TIMER_GUARD(block_checksum_time); + uint32_t value = DecodeFixed32(data + block_size_ + 1); + uint32_t actual = 0; + switch (footer_.checksum()) { + case kNoChecksum: + break; + case kCRC32c: + value = crc32c::Unmask(value); + actual = crc32c::Value(data, block_size_ + 1); + break; + case kxxHash: + actual = XXH32(data, static_cast<int>(block_size_) + 1, 0); + break; + case kxxHash64: + actual = static_cast<uint32_t>( + XXH64(data, static_cast<int>(block_size_) + 1, 0) & + uint64_t{0xffffffff}); + break; + default: + status_ = Status::Corruption( + "unknown checksum type " + ToString(footer_.checksum()) + " in " + + file_->file_name() + " offset " + ToString(handle_.offset()) + + " size " + ToString(block_size_)); + } + if (status_.ok() && actual != value) { + status_ = Status::Corruption( + "block checksum mismatch: expected " + ToString(actual) + ", got " + + ToString(value) + " in " + file_->file_name() + " offset " + + ToString(handle_.offset()) + " size " + ToString(block_size_)); + } + } +} + +inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + Status status = PersistentCacheHelper::LookupUncompressedPage( + cache_options_, handle_, contents_); + if (status.ok()) { + // uncompressed page is found for the block handle + return true; + } else { + // uncompressed page is not found + if (ioptions_.info_log && !status.IsNotFound()) { + assert(!status.ok()); + ROCKS_LOG_INFO(ioptions_.info_log, + "Error reading from persistent cache. 
%s", + status.ToString().c_str()); + } + } + } + return false; +} + +inline bool BlockFetcher::TryGetFromPrefetchBuffer() { + if (prefetch_buffer_ != nullptr && + prefetch_buffer_->TryReadFromCache( + handle_.offset(), + static_cast<size_t>(handle_.size()) + kBlockTrailerSize, &slice_)) { + block_size_ = static_cast<size_t>(handle_.size()); + CheckBlockChecksum(); + if (!status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast<char*>(slice_.data()); + } + return got_from_prefetch_buffer_; +} + +inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + // lookup uncompressed cache mode p-cache + std::unique_ptr<char[]> raw_data; + status_ = PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize); + if (status_.ok()) { + heap_buf_ = CacheAllocationPtr(raw_data.release()); + used_buf_ = heap_buf_.get(); + slice_ = Slice(heap_buf_.get(), block_size_); + return true; + } else if (!status_.IsNotFound() && ioptions_.info_log) { + assert(!status_.ok()); + ROCKS_LOG_INFO(ioptions_.info_log, + "Error reading from persistent cache. %s", + status_.ToString().c_str()); + } + } + return false; +} + +inline void BlockFetcher::PrepareBufferForBlockFromFile() { + // cache miss read from device + if (do_uncompress_ && + block_size_ + kBlockTrailerSize < kDefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf_ = &stack_buf_[0]; + } else if (maybe_compressed_ && !do_uncompress_) { + compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, + memory_allocator_compressed_); + used_buf_ = compressed_buf_.get(); + } else { + heap_buf_ = + AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + used_buf_ = heap_buf_.get(); + } +} + +inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { + if (status_.ok() && read_options_.fill_cache && + cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + // insert to raw cache + PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_, + block_size_ + kBlockTrailerSize); + } +} + +inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { + if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && + cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + // insert to uncompressed cache + PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, + *contents_); + } +} + +inline void BlockFetcher::CopyBufferToHeap() { + assert(used_buf_ != heap_buf_.get()); + heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); +} + +inline void BlockFetcher::GetBlockContents() { + if (slice_.data() != used_buf_) { + // the slice content is not the buffer provided + *contents_ = BlockContents(Slice(slice_.data(), block_size_)); + } else { + // page can be either uncompressed or compressed, the buffer either stack + // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 + if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { + CopyBufferToHeap(); + } else if (used_buf_ == compressed_buf_.get()) { + if (compression_type_ == kNoCompression && + memory_allocator_ != memory_allocator_compressed_) { + CopyBufferToHeap(); + } else { + heap_buf_ = std::move(compressed_buf_); + } + } + *contents_ = BlockContents(std::move(heap_buf_), block_size_); + } +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif +} + +Status BlockFetcher::ReadBlockContents() { + block_size_ = static_cast<size_t>(handle_.size()); + + if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif // NDEBUG + return Status::OK(); + } + if (TryGetFromPrefetchBuffer()) { + if (!status_.ok()) { + return status_; + } + } else if (!TryGetCompressedBlockFromPersistentCache()) { + PrepareBufferForBlockFromFile(); + Status s; + + { + PERF_TIMER_GUARD(block_read_time); + // Actual file read + status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, + &slice_, used_buf_); + } + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); + if (!status_.ok()) { + return status_; + } + + if (slice_.size() != block_size_ + kBlockTrailerSize) { + return Status::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_ + kBlockTrailerSize) + + " bytes, got " + ToString(slice_.size())); + } + + CheckBlockChecksum(); + if (status_.ok()) { + InsertCompressedBlockToPersistentCacheIfNeeded(); + } else { + return status_; + } + } + + PERF_TIMER_GUARD(block_decompress_time); + + compression_type_ = get_block_compression_type(slice_.data(), block_size_); + + if (do_uncompress_ && compression_type_ != kNoCompression) { + // compressed page, uncompress, update cache + UncompressionContext context(compression_type_); + UncompressionInfo info(context, uncompression_dict_, compression_type_); + status_ = UncompressBlockContents(info, slice_.data(), block_size_, + contents_, footer_.version(), ioptions_, + memory_allocator_); + compression_type_ = kNoCompression; + } else { + GetBlockContents(); + } + + InsertUncompressedBlockToPersistentCacheIfNeeded(); + + return status_; +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_fetcher.h b/src/rocksdb/table/block_fetcher.h new file mode 100644 index 00000000..b5fee941 --- /dev/null +++ b/src/rocksdb/table/block_fetcher.h @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "table/block.h" +#include "table/format.h" +#include "util/memory_allocator.h" + +namespace rocksdb { +class BlockFetcher { + public: + // Read the block identified by "handle" from "file". + // The only relevant option is options.verify_checksums for now. + // On failure return non-OK. 
+ // On success fill *result and return OK - caller owns *result + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + BlockFetcher(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, + const ReadOptions& read_options, const BlockHandle& handle, + BlockContents* contents, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + MemoryAllocator* memory_allocator = nullptr, + MemoryAllocator* memory_allocator_compressed = nullptr) + : file_(file), + prefetch_buffer_(prefetch_buffer), + footer_(footer), + read_options_(read_options), + handle_(handle), + contents_(contents), + ioptions_(ioptions), + do_uncompress_(do_uncompress), + maybe_compressed_(maybe_compressed), + uncompression_dict_(uncompression_dict), + cache_options_(cache_options), + memory_allocator_(memory_allocator), + memory_allocator_compressed_(memory_allocator_compressed) {} + Status ReadBlockContents(); + CompressionType get_compression_type() const { return compression_type_; } + + private: + static const uint32_t kDefaultStackBufferSize = 5000; + + RandomAccessFileReader* file_; + FilePrefetchBuffer* prefetch_buffer_; + const Footer& footer_; + const ReadOptions read_options_; + const BlockHandle& handle_; + BlockContents* contents_; + const ImmutableCFOptions& ioptions_; + bool do_uncompress_; + bool maybe_compressed_; + const UncompressionDict& uncompression_dict_; + const PersistentCacheOptions& cache_options_; + MemoryAllocator* memory_allocator_; + MemoryAllocator* memory_allocator_compressed_; + Status status_; + Slice slice_; + char* used_buf_ = nullptr; + size_t block_size_; + CacheAllocationPtr heap_buf_; + CacheAllocationPtr compressed_buf_; + char stack_buf_[kDefaultStackBufferSize]; + bool got_from_prefetch_buffer_ = false; + rocksdb::CompressionType compression_type_; + + // return true if found + bool TryGetUncompressBlockFromPersistentCache(); + // return true if found + bool TryGetFromPrefetchBuffer(); + bool TryGetCompressedBlockFromPersistentCache(); + void PrepareBufferForBlockFromFile(); + // Copy content from used_buf_ to new heap buffer. + void CopyBufferToHeap(); + void GetBlockContents(); + void InsertCompressedBlockToPersistentCacheIfNeeded(); + void InsertUncompressedBlockToPersistentCacheIfNeeded(); + void CheckBlockChecksum(); +}; +} // namespace rocksdb diff --git a/src/rocksdb/table/block_prefix_index.cc b/src/rocksdb/table/block_prefix_index.cc new file mode 100644 index 00000000..67c749d4 --- /dev/null +++ b/src/rocksdb/table/block_prefix_index.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
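For readers tracing CheckBlockChecksum() in block_fetcher.cc above: the kCRC32c path reduces to a few lines. The sketch below is illustrative (the function name is made up); it assumes the buffer holds the block payload followed by the 1-byte compression type and the 4-byte masked checksum, which is the trailer layout the reads above imply.

#include <cstddef>
#include <cstdint>

#include "util/coding.h"  // rocksdb::DecodeFixed32
#include "util/crc32c.h"  // rocksdb::crc32c::Value, rocksdb::crc32c::Unmask

// Returns true if the stored (masked) crc32c matches a checksum computed over
// the block payload plus the compression-type byte, as in CheckBlockChecksum.
bool BlockCrc32cMatches(const char* buf, size_t block_size) {
  const uint32_t stored =
      rocksdb::crc32c::Unmask(rocksdb::DecodeFixed32(buf + block_size + 1));
  const uint32_t actual = rocksdb::crc32c::Value(buf, block_size + 1);
  return stored == actual;
}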
+ +#include "table/block_prefix_index.h" + +#include <vector> + +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +inline uint32_t Hash(const Slice& s) { + return rocksdb::Hash(s.data(), s.size(), 0); +} + +inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { + return Hash(prefix) % num_buckets; +} + +// The prefix block index is simply a bucket array, with each entry pointing to +// the blocks that span the prefixes hashed to this bucket. +// +// To reduce memory footprint, if there is only one block per bucket, the entry +// stores the block id directly. If there are more than one blocks per bucket, +// because of hash collision or a single prefix spanning multiple blocks, +// the entry points to an array of block ids. The block array is an array of +// uint32_t's. The first uint32_t indicates the total number of blocks, followed +// by the block ids. +// +// To differentiate the two cases, the high order bit of the entry indicates +// whether it is a 'pointer' into a separate block array. +// 0x7FFFFFFF is reserved for empty bucket. + +const uint32_t kNoneBlock = 0x7FFFFFFF; +const uint32_t kBlockArrayMask = 0x80000000; + +inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; } + +inline bool IsBlockId(uint32_t block_id) { + return (block_id & kBlockArrayMask) == 0; +} + +inline uint32_t DecodeIndex(uint32_t block_id) { + uint32_t index = block_id ^ kBlockArrayMask; + assert(index < kBlockArrayMask); + return index; +} + +inline uint32_t EncodeIndex(uint32_t index) { + assert(index < kBlockArrayMask); + return index | kBlockArrayMask; +} + +// temporary storage for prefix information during index building +struct PrefixRecord { + Slice prefix; + uint32_t start_block; + uint32_t end_block; + uint32_t num_blocks; + PrefixRecord* next; +}; + +class BlockPrefixIndex::Builder { + public: + explicit Builder(const SliceTransform* internal_prefix_extractor) + : internal_prefix_extractor_(internal_prefix_extractor) {} + + void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) { + PrefixRecord* record = reinterpret_cast<PrefixRecord*>( + arena_.AllocateAligned(sizeof(PrefixRecord))); + record->prefix = key_prefix; + record->start_block = start_block; + record->end_block = start_block + num_blocks - 1; + record->num_blocks = num_blocks; + prefixes_.push_back(record); + } + + BlockPrefixIndex* Finish() { + // For now, use roughly 1:1 prefix to bucket ratio. + uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1; + + // Collect prefix records that hash to the same bucket, into a single + // linklist. + std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr); + std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0); + for (PrefixRecord* current : prefixes_) { + uint32_t bucket = PrefixToBucket(current->prefix, num_buckets); + // merge the prefix block span if the first block of this prefix is + // connected to the last block of the previous prefix. 
+ PrefixRecord* prev = prefixes_per_bucket[bucket]; + if (prev) { + assert(current->start_block >= prev->end_block); + auto distance = current->start_block - prev->end_block; + if (distance <= 1) { + prev->end_block = current->end_block; + prev->num_blocks = prev->end_block - prev->start_block + 1; + num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1); + continue; + } + } + current->next = prev; + prefixes_per_bucket[bucket] = current; + num_blocks_per_bucket[bucket] += current->num_blocks; + } + + // Calculate the block array buffer size + uint32_t total_block_array_entries = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks > 1) { + total_block_array_entries += (num_blocks + 1); + } + } + + // Populate the final prefix block index + uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; + uint32_t* buckets = new uint32_t[num_buckets]; + uint32_t offset = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks == 0) { + assert(prefixes_per_bucket[i] == nullptr); + buckets[i] = kNoneBlock; + } else if (num_blocks == 1) { + assert(prefixes_per_bucket[i] != nullptr); + assert(prefixes_per_bucket[i]->next == nullptr); + buckets[i] = prefixes_per_bucket[i]->start_block; + } else { + assert(total_block_array_entries > 0); + assert(prefixes_per_bucket[i] != nullptr); + buckets[i] = EncodeIndex(offset); + block_array_buffer[offset] = num_blocks; + uint32_t* last_block = &block_array_buffer[offset + num_blocks]; + auto current = prefixes_per_bucket[i]; + // populate block ids from largest to smallest + while (current != nullptr) { + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; + last_block--; + } + current = current->next; + } + assert(last_block == &block_array_buffer[offset]); + offset += (num_blocks + 1); + } + } + + assert(offset == total_block_array_entries); + + return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, + buckets, total_block_array_entries, + block_array_buffer); + } + + private: + const SliceTransform* internal_prefix_extractor_; + + std::vector<PrefixRecord*> prefixes_; + Arena arena_; +}; + +Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index) { + uint64_t pos = 0; + auto meta_pos = prefix_meta; + Status s; + Builder builder(internal_prefix_extractor); + + while (!meta_pos.empty()) { + uint32_t prefix_size = 0; + uint32_t entry_index = 0; + uint32_t num_blocks = 0; + if (!GetVarint32(&meta_pos, &prefix_size) || + !GetVarint32(&meta_pos, &entry_index) || + !GetVarint32(&meta_pos, &num_blocks)) { + s = Status::Corruption( + "Corrupted prefix meta block: unable to read from it."); + break; + } + if (pos + prefix_size > prefixes.size()) { + s = Status::Corruption( + "Corrupted prefix meta block: size inconsistency."); + break; + } + Slice prefix(prefixes.data() + pos, prefix_size); + builder.Add(prefix, entry_index, num_blocks); + + pos += prefix_size; + } + + if (s.ok() && pos != prefixes.size()) { + s = Status::Corruption("Corrupted prefix meta block"); + } + + if (s.ok()) { + *prefix_index = builder.Finish(); + } + + return s; +} + +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { + Slice prefix = internal_prefix_extractor_->Transform(key); + + uint32_t bucket = PrefixToBucket(prefix, num_buckets_); + 
uint32_t block_id = buckets_[bucket]; + + if (IsNone(block_id)) { + return 0; + } else if (IsBlockId(block_id)) { + *blocks = &buckets_[bucket]; + return 1; + } else { + uint32_t index = DecodeIndex(block_id); + assert(index < num_block_array_buffer_entries_); + *blocks = &block_array_buffer_[index + 1]; + uint32_t num_blocks = block_array_buffer_[index]; + assert(num_blocks > 1); + assert(index + num_blocks < num_block_array_buffer_entries_); + return num_blocks; + } +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_prefix_index.h b/src/rocksdb/table/block_prefix_index.h new file mode 100644 index 00000000..105606db --- /dev/null +++ b/src/rocksdb/table/block_prefix_index.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <stdint.h> +#include "rocksdb/status.h" + +namespace rocksdb { + +class Comparator; +class Iterator; +class Slice; +class SliceTransform; + +// Build a hash-based index to speed up the lookup for "index block". +// BlockHashIndex accepts a key and, if found, returns its restart index within +// that index block. +class BlockPrefixIndex { + public: + // Maps a key to a list of data blocks that could potentially contain + // the key, based on the prefix. + // Returns the total number of relevant blocks, 0 means the key does + // not exist. + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + + size_t ApproximateMemoryUsage() const { + return sizeof(BlockPrefixIndex) + + (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); + } + + // Create hash index by reading from the metadata blocks. + // @params prefixes: a sequence of prefixes. + // @params prefix_meta: contains the "metadata" to of the prefixes. + static Status Create(const SliceTransform* hash_key_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index); + + ~BlockPrefixIndex() { + delete[] buckets_; + delete[] block_array_buffer_; + } + + private: + class Builder; + friend Builder; + + BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, + uint32_t num_buckets, uint32_t* buckets, + uint32_t num_block_array_buffer_entries, + uint32_t* block_array_buffer) + : internal_prefix_extractor_(internal_prefix_extractor), + num_buckets_(num_buckets), + num_block_array_buffer_entries_(num_block_array_buffer_entries), + buckets_(buckets), + block_array_buffer_(block_array_buffer) {} + + const SliceTransform* internal_prefix_extractor_; + uint32_t num_buckets_; + uint32_t num_block_array_buffer_entries_; + uint32_t* buckets_; + uint32_t* block_array_buffer_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/block_test.cc b/src/rocksdb/table/block_test.cc new file mode 100644 index 00000000..3e0ff3ea --- /dev/null +++ b/src/rocksdb/table/block_test.cc @@ -0,0 +1,609 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
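As a complement to BlockPrefixIndex::Create() above, the sketch below shows how the two inputs it parses could be assembled: `prefixes` is a plain concatenation of the prefix bytes, and `prefix_meta` is a sequence of <prefix_size, start_block, num_blocks> varint32 triples. The struct and function names here are hypothetical; only the wire format mirrors what Create() reads.

#include <cstdint>
#include <string>
#include <vector>

#include "util/coding.h"  // rocksdb::PutVarint32

struct ToyPrefixEntry {
  std::string prefix;
  uint32_t start_block;  // id of the first block spanned by this prefix
  uint32_t num_blocks;   // how many consecutive blocks it spans
};

// Produces the (prefixes, prefix_meta) pair consumed by
// BlockPrefixIndex::Create().
void BuildPrefixIndexInput(const std::vector<ToyPrefixEntry>& entries,
                           std::string* prefixes, std::string* prefix_meta) {
  for (const auto& e : entries) {
    prefixes->append(e.prefix);
    rocksdb::PutVarint32(prefix_meta, static_cast<uint32_t>(e.prefix.size()));
    rocksdb::PutVarint32(prefix_meta, e.start_block);
    rocksdb::PutVarint32(prefix_meta, e.num_blocks);
  }
}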
+// +#include <stdio.h> +#include <algorithm> +#include <set> +#include <string> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static std::string RandomString(Random *rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random *rnd) { + char buf[50]; + char *p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += RandomString(rnd, padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generated +// different kinds of test key/value pairs for different scenario. +void GenerateRandomKVs(std::vector<std::string> *keys, + std::vector<std::string> *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector<std::string> *keys, + std::vector<BlockHandle> *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + +class BlockTest : public testing::Test {}; + +// block test +TEST_F(BlockTest, SimpleTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr<InternalKeyComparator> ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 100000; + + GenerateRandomKVs(&keys, &values, 0, num_records); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + // read contents of block sequentially + int count = 0; + InternalIterator *iter = + reader.NewIterator<DataBlockIter>(options.comparator, options.comparator); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + Slice v = 
iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + ASSERT_EQ(v.ToString().compare(values[count]), 0); + } + delete iter; + + // read block contents randomly + iter = + reader.NewIterator<DataBlockIter>(options.comparator, options.comparator); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + Slice v = iter->value(); + ASSERT_EQ(v.ToString().compare(values[index]), 0); + } + delete iter; +} + +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + std::unique_ptr<InternalKeyComparator> ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector<std::string> keys; + std::vector<BlockHandle> values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase<BlockHandle> *iter = reader.NewIterator<IndexBlockIter>( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator<IndexBlockIter>( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} +// return the block contents +BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder, + const std::vector<std::string> &keys, + const std::vector<std::string> &values, + const int /*prefix_group_size*/ = 1) { + 
builder->reset(new BlockBuilder(1 /* restart interval */));
+
+  // Add all of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  BlockContents contents_ref(contents.data);
+  Block reader1(std::move(contents), kDisableGlobalSequenceNumber);
+  Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  std::unique_ptr<InternalIterator> regular_iter(
+      reader2.NewIterator<DataBlockIter>(BytewiseComparator(),
+                                         BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    regular_iter->Seek(keys[i]);
+    ASSERT_OK(regular_iter->status());
+    ASSERT_TRUE(regular_iter->Valid());
+
+    Slice v = regular_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For hash index, if no key with a given prefix is found, the iterator will
+  // simply be set as invalid; whereas the binary search based iterator will
+  // return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST_F(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys starting with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with the same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+// A slow and accurate version of BlockReadAmpBitmap that simply stores
+// all the marked ranges in a set.
+class BlockReadAmpBitmapSlowAndAccurate {
+ public:
+  void Mark(size_t start_offset, size_t end_offset) {
+    assert(end_offset >= start_offset);
+    marked_ranges_.emplace(end_offset, start_offset);
+  }
+
+  void ResetCheckSequence() { iter_valid_ = false; }
+
+  // Return true if any byte in this range was Marked
+  // This does linear search from the previous position. When calling
+  // multiple times, `offset` needs to be incremental to get correct results.
+  // Call ResetCheckSequence() to reset it.
+  bool IsPinMarked(size_t offset) {
+    if (iter_valid_) {
+      // Has existing iterator, try linear search from
+      // the iterator.
+      for (int i = 0; i < 64; i++) {
+        if (offset < iter_->second) {
+          return false;
+        }
+        if (offset <= iter_->first) {
+          return true;
+        }
+
+        iter_++;
+        if (iter_ == marked_ranges_.end()) {
+          iter_valid_ = false;
+          return false;
+        }
+      }
+    }
+    // Initial call or have linear searched too many times.
+    // Do binary search.
+    iter_ = marked_ranges_.lower_bound(
+        std::make_pair(offset, static_cast<size_t>(0)));
+    if (iter_ == marked_ranges_.end()) {
+      iter_valid_ = false;
+      return false;
+    }
+    iter_valid_ = true;
+    return offset <= iter_->first && offset >= iter_->second;
+  }
+
+ private:
+  std::set<std::pair<size_t, size_t>> marked_ranges_;
+  std::set<std::pair<size_t, size_t>>::iterator iter_;
+  bool iter_valid_ = false;
+};
+
+TEST_F(BlockTest, BlockReadAmpBitmap) {
+  uint32_t pin_offset = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
+        pin_offset = *(static_cast<uint32_t *>(arg));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<size_t> block_sizes = {
+      1,                // 1 byte
+      32,               // 32 bytes
+      61,               // 61 bytes
+      64,               // 64 bytes
+      512,              // 0.5 KB
+      1024,             // 1 KB
+      1024 * 4,         // 4 KB
+      1024 * 10,        // 10 KB
+      1024 * 50,        // 50 KB
+      1024 * 1024 * 4,  // 4 MB
+      777,
+      124653,
+  };
+  const size_t kBytesPerBit = 64;
+
+  Random rnd(301);
+  for (size_t block_size : block_sizes) {
+    std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
+    BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
+    BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
+
+    size_t needed_bits = (block_size / kBytesPerBit);
+    if (block_size % kBytesPerBit != 0) {
+      needed_bits++;
+    }
+
+    ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
+
+    // Generate some random entries
+    std::vector<size_t> random_entry_offsets;
+    for (int i = 0; i < 1000; i++) {
+      random_entry_offsets.push_back(rnd.Next() % block_size);
+    }
+    std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
+    auto it =
+        std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
+    random_entry_offsets.resize(
+        std::distance(random_entry_offsets.begin(), it));
+
+    std::vector<std::pair<size_t, size_t>> random_entries;
+    for (size_t i = 0; i < random_entry_offsets.size(); i++) {
+      size_t entry_start = random_entry_offsets[i];
+      size_t entry_end;
+      if (i + 1 < random_entry_offsets.size()) {
+        entry_end = random_entry_offsets[i + 1] - 1;
+      } else {
+        entry_end = block_size - 1;
+      }
+      random_entries.emplace_back(entry_start, entry_end);
+    }
+
+    for (size_t i = 0; i < random_entries.size(); i++) {
+      read_amp_slow_and_accurate.ResetCheckSequence();
+      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+
+      read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
+                           static_cast<uint32_t>(current_entry.second));
+      read_amp_slow_and_accurate.Mark(current_entry.first,
+                                      current_entry.second);
+
+      size_t total_bits = 0;
+      for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
+        total_bits += read_amp_slow_and_accurate.IsPinMarked(
+            bit_idx * kBytesPerBit + pin_offset);
+      }
+      size_t expected_estimate_useful = total_bits * kBytesPerBit;
+      size_t got_estimate_useful =
+          stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+      ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlockTest, BlockWithReadAmpBitmap) {
+  Random rnd(301);
+  Options options = Options();
+
std::unique_ptr<InternalKeyComparator> ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 10000; + + GenerateRandomKVs(&keys, &values, 0, num_records, 1); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + Slice rawblock = builder.Finish(); + const size_t kBytesPerBit = 8; + + // Read the block sequentially using Next() + { + std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + // read contents of block sequentially + size_t read_bytes = 0; + DataBlockIter *iter = + static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>( + options.comparator, options.comparator, nullptr, stats.get())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + + delete iter; + } + + // Read the block sequentially using Seek() + { + std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = + static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>( + options.comparator, options.comparator, nullptr, stats.get())); + for (int i = 0; i < num_records; i++) { + Slice k(keys[i]); + + // search in block for this key + iter->Seek(k); + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + delete iter; + } + + // Read the block randomly + { + std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = + static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>( + options.comparator, options.comparator, nullptr, stats.get())); + std::unordered_set<int> read_keys; + for (int i = 0; i < num_records; i++) { + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + iter->Seek(k); + iter->value(); + if (read_keys.find(index) == read_keys.end()) { + read_keys.insert(index); + read_bytes += iter->TEST_CurrentEntrySize(); + } + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = 
static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + // Error in read amplification will be less than 2% if we are reading + // randomly + EXPECT_LT(error_pct, 2); + } + delete iter; + } +} + +TEST_F(BlockTest, ReadAmpBitmapPow2) { + std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics(); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); +} + +} // namespace rocksdb + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/bloom_block.cc b/src/rocksdb/table/bloom_block.cc new file mode 100644 index 00000000..61959030 --- /dev/null +++ b/src/rocksdb/table/bloom_block.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/bloom_block.h" + +#include <string> +#include "rocksdb/slice.h" +#include "util/dynamic_bloom.h" + +namespace rocksdb { + +void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t>& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace rocksdb diff --git a/src/rocksdb/table/bloom_block.h b/src/rocksdb/table/bloom_block.h new file mode 100644 index 00000000..483fa25d --- /dev/null +++ b/src/rocksdb/table/bloom_block.h @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include <vector> +#include <string> +#include "util/dynamic_bloom.h" + +namespace rocksdb { +class Logger; + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector<uint32_t>& keys_hashes); + + Slice Finish(); + + private: + DynamicBloom bloom_; +}; + +}; // namespace rocksdb diff --git a/src/rocksdb/table/cleanable_test.cc b/src/rocksdb/table/cleanable_test.cc new file mode 100644 index 00000000..f18c33b8 --- /dev/null +++ b/src/rocksdb/table/cleanable_test.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <functional> + +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" +#include "rocksdb/perf_context.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class CleanableTest : public testing::Test {}; + +// Use this to keep track of the cleanups that were actually performed +void Multiplier(void* arg1, void* arg2) { + int* res = reinterpret_cast<int*>(arg1); + int* num = reinterpret_cast<int*>(arg2); + *res *= *num; +} + +// the first Cleanup is on stack and the rest on heap, so test with both cases +TEST_F(CleanableTest, Register) { + int n2 = 2, n3 = 3; + int res = 1; + { Cleanable c1; } + // ~Cleanable + ASSERT_EQ(1, res); + + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + } + // ~Cleanable + ASSERT_EQ(2, res); + + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + } + // ~Cleanable + ASSERT_EQ(6, res); + + // Test the Reset does cleanup + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.Reset(); + ASSERT_EQ(6, res); + } + // ~Cleanable + ASSERT_EQ(6, res); + + // Test Clenable is usable after Reset + res = 1; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.Reset(); + ASSERT_EQ(2, res); + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + } + // ~Cleanable + ASSERT_EQ(6, res); +} + +// the first Cleanup is on stack and the rest on heap, +// so test all the combinations of them +TEST_F(CleanableTest, Delegation) { + int n2 = 2, n3 = 3, n5 = 5, n7 = 7; + int res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(2, res); + + res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(1, res); + + res = 1; + { + Cleanable c2; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); + } + // ~Cleanable 
+ ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(6, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(30, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.RegisterCleanup(Multiplier, &res, &n3); // res = 2 * 3; + c1.DelegateCleanupsTo(&c2); // res = 2 * 3 * 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(210, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + c1.DelegateCleanupsTo(&c2); // res = 2 * 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(70, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + c2.RegisterCleanup(Multiplier, &res, &n7); // res = 5 * 7; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); // res = 5 * 7; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(35, res); + + res = 1; + { + Cleanable c2; + c2.RegisterCleanup(Multiplier, &res, &n5); // res = 5; + { + Cleanable c1; + c1.DelegateCleanupsTo(&c2); // res = 5; + } + // ~Cleanable + ASSERT_EQ(1, res); + } + // ~Cleanable + ASSERT_EQ(5, res); +} + +static void ReleaseStringHeap(void* s, void*) { + delete reinterpret_cast<const std::string*>(s); +} + +class PinnableSlice4Test : public PinnableSlice { + public: + void TestStringIsRegistered(std::string* s) { + ASSERT_TRUE(cleanup_.function == ReleaseStringHeap); + ASSERT_EQ(cleanup_.arg1, s); + ASSERT_EQ(cleanup_.arg2, nullptr); + ASSERT_EQ(cleanup_.next, nullptr); + } +}; + +// Putting the PinnableSlice tests here due to similarity to Cleanable tests +TEST_F(CleanableTest, PinnableSlice) { + int n2 = 2; + int res = 1; + const std::string const_str = "123"; + + { + res = 1; + PinnableSlice4Test value; + Slice slice(const_str); + value.PinSlice(slice, Multiplier, &res, &n2); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } + // ~Cleanable + ASSERT_EQ(2, res); + + { + res = 1; + PinnableSlice4Test value; + Slice slice(const_str); + { + Cleanable c1; + c1.RegisterCleanup(Multiplier, &res, &n2); // res = 2; + value.PinSlice(slice, &c1); + } + // ~Cleanable + ASSERT_EQ(1, res); // cleanups must have be delegated to value + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } + // ~Cleanable + ASSERT_EQ(2, res); + + { + PinnableSlice4Test value; + Slice slice(const_str); + value.PinSelf(slice); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } + + { + PinnableSlice4Test value; + std::string* self_str_ptr = value.GetSelf(); + self_str_ptr->assign(const_str); + value.PinSelf(); + std::string str; + str.assign(value.data(), value.size()); + ASSERT_EQ(const_str, str); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} 
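The cleanable_test.cc cases above exercise the two core Cleanable operations: RegisterCleanup() queues a (function, arg1, arg2) callback that runs when the object is destroyed or Reset(), and DelegateCleanupsTo() transfers any pending callbacks to another Cleanable so they fire at the delegate's destruction instead; PinnableSlice::PinSlice() uses the same delegation to keep a referenced buffer alive. A minimal usage sketch, assuming only the API exercised in the tests above (the include path is an assumption and differs between RocksDB releases):

  #include "rocksdb/cleanable.h"  // assumed path; older releases expose Cleanable via rocksdb/iterator.h

  namespace {
  void FreeBuffer(void* arg1, void* /*arg2*/) {
    delete[] static_cast<char*>(arg1);
  }
  }  // namespace

  void CleanableSketch() {
    rocksdb::Cleanable outer;
    {
      rocksdb::Cleanable inner;
      char* buf = new char[16];
      // The callback fires when whichever Cleanable ends up owning it is destroyed.
      inner.RegisterCleanup(FreeBuffer, buf, nullptr);
      // Hand the pending cleanup to outer; inner's destructor now runs nothing.
      inner.DelegateCleanupsTo(&outer);
    }
    // buf is still valid here; it is freed when outer goes out of scope.
  }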
diff --git a/src/rocksdb/table/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo_table_builder.cc new file mode 100644 index 00000000..f590e6ad --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_builder.cc @@ -0,0 +1,516 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/cuckoo_table_builder.h" + +#include <assert.h> +#include <algorithm> +#include <limits> +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "table/block_builder.h" +#include "table/cuckoo_table_factory.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/autovector.h" +#include "util/file_reader_writer.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace rocksdb { +const std::string CuckooTablePropertyNames::kEmptyKey = + "rocksdb.cuckoo.bucket.empty.key"; +const std::string CuckooTablePropertyNames::kNumHashFunc = + "rocksdb.cuckoo.hash.num"; +const std::string CuckooTablePropertyNames::kHashTableSize = + "rocksdb.cuckoo.hash.size"; +const std::string CuckooTablePropertyNames::kValueLength = + "rocksdb.cuckoo.value.length"; +const std::string CuckooTablePropertyNames::kIsLastLevel = + "rocksdb.cuckoo.file.islastlevel"; +const std::string CuckooTablePropertyNames::kCuckooBlockSize = + "rocksdb.cuckoo.hash.cuckooblocksize"; +const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = + "rocksdb.cuckoo.hash.identityfirst"; +const std::string CuckooTablePropertyNames::kUseModuleHash = + "rocksdb.cuckoo.hash.usemodule"; +const std::string CuckooTablePropertyNames::kUserKeyLength = + "rocksdb.cuckoo.hash.userkeylength"; + +// Obtained by running echo rocksdb.table.cuckoo | sha1sum +extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; + +CuckooTableBuilder::CuckooTableBuilder( + WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_table, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name) + : num_hash_func_(2), + file_(file), + max_hash_table_ratio_(max_hash_table_ratio), + max_num_hash_func_(max_num_hash_table), + max_search_depth_(max_search_depth), + cuckoo_block_size_(std::max(1U, cuckoo_block_size)), + hash_table_size_(use_module_hash ? 0 : 2), + is_last_level_file_(false), + has_seen_first_key_(false), + has_seen_first_value_(false), + key_size_(0), + value_size_(0), + num_entries_(0), + num_values_(0), + ucomp_(user_comparator), + use_module_hash_(use_module_hash), + identity_as_first_hash_(identity_as_first_hash), + get_slice_hash_(get_slice_hash), + closed_(false) { + // Data is in a huge block. 
+ properties_.num_data_blocks = 1; + properties_.index_size = 0; + properties_.filter_size = 0; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; +} + +void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { + if (num_entries_ >= kMaxVectorIdx - 1) { + status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); + return; + } + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + status_ = Status::Corruption("Unable to parse key into inernal key."); + return; + } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + ToString(ikey.type)); + return; + } + + // Determine if we can ignore the sequence number and value type from + // internal keys by looking at sequence number from first key. We assume + // that if first key has a zero sequence number, then all the remaining + // keys will have zero seq. no. + if (!has_seen_first_key_) { + is_last_level_file_ = ikey.sequence == 0; + has_seen_first_key_ = true; + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); + } + if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; + } + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; + } else { + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } + } + ++num_entries_; + + // In order to fill the empty buckets in the hash table, we identify a + // key which is not used so far (unused_user_key). We determine this by + // maintaining smallest and largest keys inserted so far in bytewise order + // and use them to find a key outside this range in Finish() operation. + // Note that this strategy is independent of user comparator used here. + if (ikey.user_key.compare(smallest_user_key_) < 0) { + smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } else if (ikey.user_key.compare(largest_user_key_) > 0) { + largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + } + if (!use_module_hash_) { + if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) { + hash_table_size_ *= 2; + } + } +} + +bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { + assert(closed_); + return idx >= num_values_; +} + +Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + return Slice(&deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)], static_cast<size_t>(key_size_)); + } + return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))], static_cast<size_t>(key_size_)); +} + +Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + assert(closed_); + return is_last_level_file_ ? 
GetKey(idx) : ExtractUserKey(GetKey(idx)); +} + +Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + static std::string empty_value(static_cast<unsigned int>(value_size_), 'a'); + return Slice(empty_value); + } + return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)], static_cast<size_t>(value_size_)); +} + +Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) { + buckets->resize(static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1)); + uint32_t make_space_for_key_call_id = 0; + for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { + uint64_t bucket_id = 0; + bool bucket_found = false; + autovector<uint64_t> hash_vals; + Slice user_key = GetUserKey(vector_idx); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; + ++hash_cnt) { + uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); + // If there is a collision, check next cuckoo_block_size_ locations for + // empty locations. While checking, if we reach end of the hash table, + // stop searching and proceed for next hash function. + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) { + bucket_id = hash_val; + bucket_found = true; + break; + } else { + if (ucomp_->Compare(user_key, + GetUserKey((*buckets)[static_cast<size_t>(hash_val)].vector_idx)) == 0) { + return Status::NotSupported("Same key is being inserted again."); + } + hash_vals.push_back(hash_val); + } + } + } + while (!bucket_found && !MakeSpaceForKey(hash_vals, + ++make_space_for_key_call_id, buckets, &bucket_id)) { + // Rehash by increashing number of hash tables. + if (num_hash_func_ >= max_num_hash_func_) { + return Status::NotSupported("Too many collisions. Unable to hash."); + } + // We don't really need to rehash the entire table because old hashes are + // still valid and we only increased the number of hash functions. + uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); + ++num_hash_func_; + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++hash_val) { + if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) { + bucket_found = true; + bucket_id = hash_val; + break; + } else { + hash_vals.push_back(hash_val); + } + } + } + (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx; + } + return Status::OK(); +} + +Status CuckooTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + std::vector<CuckooBucket> buckets; + Status s; + std::string unused_bucket; + if (num_entries_ > 0) { + // Calculate the real hash size if module hash is enabled. + if (use_module_hash_) { + hash_table_size_ = + static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_); + } + s = MakeHashTable(&buckets); + if (!s.ok()) { + return s; + } + // Determine unused_user_key to fill empty buckets. + std::string unused_user_key = smallest_user_key_; + int curr_pos = static_cast<int>(unused_user_key.size()) - 1; + while (curr_pos >= 0) { + --unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(smallest_user_key_) < 0) { + break; + } + --curr_pos; + } + if (curr_pos < 0) { + // Try using the largest key to identify an unused key. 
+ unused_user_key = largest_user_key_; + curr_pos = static_cast<int>(unused_user_key.size()) - 1; + while (curr_pos >= 0) { + ++unused_user_key[curr_pos]; + if (Slice(unused_user_key).compare(largest_user_key_) > 0) { + break; + } + --curr_pos; + } + } + if (curr_pos < 0) { + return Status::Corruption("Unable to find unused key"); + } + if (is_last_level_file_) { + unused_bucket = unused_user_key; + } else { + ParsedInternalKey ikey(unused_user_key, 0, kTypeValue); + AppendInternalKey(&unused_bucket, ikey); + } + } + properties_.num_entries = num_entries_; + properties_.num_deletions = num_entries_ - num_values_; + properties_.fixed_key_len = key_size_; + properties_.user_collected_properties[ + CuckooTablePropertyNames::kValueLength].assign( + reinterpret_cast<const char*>(&value_size_), sizeof(value_size_)); + + uint64_t bucket_size = key_size_ + value_size_; + unused_bucket.resize(static_cast<size_t>(bucket_size), 'a'); + // Write the table. + uint32_t num_added = 0; + for (auto& bucket : buckets) { + if (bucket.vector_idx == kMaxVectorIdx) { + s = file_->Append(Slice(unused_bucket)); + } else { + ++num_added; + s = file_->Append(GetKey(bucket.vector_idx)); + if (s.ok()) { + if (value_size_ > 0) { + s = file_->Append(GetValue(bucket.vector_idx)); + } + } + } + if (!s.ok()) { + return s; + } + } + assert(num_added == NumEntries()); + properties_.raw_key_size = num_added * properties_.fixed_key_len; + properties_.raw_value_size = num_added * value_size_; + + uint64_t offset = buckets.size() * bucket_size; + properties_.data_size = offset; + unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kEmptyKey] = unused_bucket; + properties_.user_collected_properties[ + CuckooTablePropertyNames::kNumHashFunc].assign( + reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_)); + + properties_.user_collected_properties[ + CuckooTablePropertyNames::kHashTableSize].assign( + reinterpret_cast<const char*>(&hash_table_size_), + sizeof(hash_table_size_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kIsLastLevel].assign( + reinterpret_cast<const char*>(&is_last_level_file_), + sizeof(is_last_level_file_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kCuckooBlockSize].assign( + reinterpret_cast<const char*>(&cuckoo_block_size_), + sizeof(cuckoo_block_size_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kIdentityAsFirstHash].assign( + reinterpret_cast<const char*>(&identity_as_first_hash_), + sizeof(identity_as_first_hash_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUseModuleHash].assign( + reinterpret_cast<const char*>(&use_module_hash_), + sizeof(use_module_hash_)); + uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size()); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUserKeyLength].assign( + reinterpret_cast<const char*>(&user_key_len), + sizeof(user_key_len)); + + // Write meta blocks. 
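+  // File layout produced by Finish(), for orientation:
+  //   [bucket 0 .. bucket N-1]  fixed-size key/value buckets (written above)
+  //   [properties block]        table properties (written next)
+  //   [metaindex block]         maps the properties block name to its handle
+  //   [footer]                  magic number + metaindex block handle
+  // At this point `offset` points just past the last bucket.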
+ MetaIndexBuilder meta_index_builder; + PropertyBlockBuilder property_block_builder; + + property_block_builder.AddTableProperty(properties_); + property_block_builder.Add(properties_.user_collected_properties); + Slice property_block = property_block_builder.Finish(); + BlockHandle property_block_handle; + property_block_handle.set_offset(offset); + property_block_handle.set_size(property_block.size()); + s = file_->Append(property_block); + offset += property_block.size(); + if (!s.ok()) { + return s; + } + + meta_index_builder.Add(kPropertiesBlock, property_block_handle); + Slice meta_index_block = meta_index_builder.Finish(); + + BlockHandle meta_index_block_handle; + meta_index_block_handle.set_offset(offset); + meta_index_block_handle.set_size(meta_index_block.size()); + s = file_->Append(meta_index_block); + if (!s.ok()) { + return s; + } + + Footer footer(kCuckooTableMagicNumber, 1); + footer.set_metaindex_handle(meta_index_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + return s; +} + +void CuckooTableBuilder::Abandon() { + assert(!closed_); + closed_ = true; +} + +uint64_t CuckooTableBuilder::NumEntries() const { + return num_entries_; +} + +uint64_t CuckooTableBuilder::FileSize() const { + if (closed_) { + return file_->GetFileSize(); + } else if (num_entries_ == 0) { + return 0; + } + + if (use_module_hash_) { + return static_cast<uint64_t>((key_size_ + value_size_) * + num_entries_ / max_hash_table_ratio_); + } else { + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and + // doubles its size. Since compaction algorithm stops adding elements + // only after it exceeds the file limit, we account for the extra element + // being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; + } + return (key_size_ + value_size_) * expected_hash_table_size - 1; + } +} + +// This method is invoked when there is no place to insert the target key. +// It searches for a set of elements that can be moved to accommodate target +// key. The search is a BFS graph traversal with first level (hash_vals) +// being all the buckets target key could go to. +// Then, from each node (curr_node), we find all the buckets that curr_node +// could go to. They form the children of curr_node in the tree. +// We continue the traversal until we find an empty bucket, in which case, we +// move all elements along the path from first level to this empty bucket, to +// make space for target key which is inserted at first level (*bucket_id). +// If tree depth exceedes max depth, we return false indicating failure. +bool CuckooTableBuilder::MakeSpaceForKey( + const autovector<uint64_t>& hash_vals, + const uint32_t make_space_for_key_call_id, + std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) { + struct CuckooNode { + uint64_t bucket_id; + uint32_t depth; + uint32_t parent_pos; + CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos) + : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {} + }; + // This is BFS search tree that is stored simply as a vector. + // Each node stores the index of parent node in the vector. 
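+  // Worked example: with two hash functions, suppose the target key hashes to
+  // buckets {3, 7} and both are occupied. Level 0 of the tree is then {3, 7}.
+  // If the key currently in bucket 3 also hashes to bucket 9 and bucket 9 is
+  // empty, the path 3 -> 9 is found: bucket 3's entry moves to bucket 9 and
+  // *bucket_id is set to 3, freeing bucket 3 for the target key.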
+ std::vector<CuckooNode> tree; + // We want to identify already visited buckets in the current method call so + // that we don't add same buckets again for exploration in the tree. + // We do this by maintaining a count of current method call in + // make_space_for_key_call_id, which acts as a unique id for this invocation + // of the method. We store this number into the nodes that we explore in + // current method call. + // It is unlikely for the increment operation to overflow because the maximum + // no. of times this will be called is <= max_num_hash_func_ + num_entries_. + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { + uint64_t bid = hash_vals[hash_cnt]; + (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = make_space_for_key_call_id; + tree.push_back(CuckooNode(bid, 0, 0)); + } + bool null_found = false; + uint32_t curr_pos = 0; + while (!null_found && curr_pos < tree.size()) { + CuckooNode& curr_node = tree[curr_pos]; + uint32_t curr_depth = curr_node.depth; + if (curr_depth >= max_search_depth_) { + break; + } + CuckooBucket& curr_bucket = (*buckets)[static_cast<size_t>(curr_node.bucket_id)]; + for (uint32_t hash_cnt = 0; + hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { + uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), + hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_, + get_slice_hash_); + // Iterate inside Cuckoo Block. + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, ++child_bucket_id) { + if ((*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id == + make_space_for_key_call_id) { + continue; + } + (*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id = + make_space_for_key_call_id; + tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, + curr_pos)); + if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == kMaxVectorIdx) { + null_found = true; + break; + } + } + } + ++curr_pos; + } + + if (null_found) { + // There is an empty node in tree.back(). Now, traverse the path from this + // empty node to top of the tree and at every node in the path, replace + // child with the parent. Stop when first level is reached in the tree + // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return + // this location in first level for target key to be inserted. + uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1; + while (bucket_to_replace_pos >= num_hash_func_) { + CuckooNode& curr_node = tree[bucket_to_replace_pos]; + (*buckets)[static_cast<size_t>(curr_node.bucket_id)] = + (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)]; + bucket_to_replace_pos = curr_node.parent_pos; + } + *bucket_id = tree[bucket_to_replace_pos].bucket_id; + } + return null_found; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo_table_builder.h new file mode 100644 index 00000000..3829541b --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_builder.h @@ -0,0 +1,127 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#ifndef ROCKSDB_LITE +#include <stdint.h> +#include <limits> +#include <string> +#include <utility> +#include <vector> +#include "port/port.h" +#include "rocksdb/status.h" +#include "table/table_builder.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "util/autovector.h" + +namespace rocksdb { + +class CuckooTableBuilder: public TableBuilder { + public: + CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, + uint32_t cuckoo_block_size, bool use_module_hash, + bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t), + uint32_t column_family_id, + const std::string& column_family_name); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~CuckooTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override { return status_; } + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + private: + struct CuckooBucket { + CuckooBucket() + : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {} + uint32_t vector_idx; + // This number will not exceed kvs_.size() + max_num_hash_func_. + // We assume number of items is <= 2^32. + uint32_t make_space_for_key_call_id; + }; + static const uint32_t kMaxVectorIdx = port::kMaxInt32; + + bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals, + const uint32_t call_id, + std::vector<CuckooBucket>* buckets, uint64_t* bucket_id); + Status MakeHashTable(std::vector<CuckooBucket>* buckets); + + inline bool IsDeletedKey(uint64_t idx) const; + inline Slice GetKey(uint64_t idx) const; + inline Slice GetUserKey(uint64_t idx) const; + inline Slice GetValue(uint64_t idx) const; + + uint32_t num_hash_func_; + WritableFileWriter* file_; + const double max_hash_table_ratio_; + const uint32_t max_num_hash_func_; + const uint32_t max_search_depth_; + const uint32_t cuckoo_block_size_; + uint64_t hash_table_size_; + bool is_last_level_file_; + bool has_seen_first_key_; + bool has_seen_first_value_; + uint64_t key_size_; + uint64_t value_size_; + // A list of fixed-size key-value pairs concatenating into a string. 
+ // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific + // key / value given an index + std::string kvs_; + std::string deleted_keys_; + // Number of key-value pairs stored in kvs_ + number of deleted keys + uint64_t num_entries_; + // Number of keys that contain value (non-deletion op) + uint64_t num_values_; + Status status_; + TableProperties properties_; + const Comparator* ucomp_; + bool use_module_hash_; + bool identity_as_first_hash_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, + uint64_t max_num_buckets); + std::string largest_user_key_ = ""; + std::string smallest_user_key_ = ""; + + bool closed_; // Either Finish() or Abandon() has been called. + + // No copying allowed + CuckooTableBuilder(const CuckooTableBuilder&) = delete; + void operator=(const CuckooTableBuilder&) = delete; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo_table_builder_test.cc new file mode 100644 index 00000000..c1e35032 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_builder_test.cc @@ -0,0 +1,649 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include <vector> +#include <string> +#include <map> +#include <utility> + +#include "table/meta_blocks.h" +#include "table/cuckoo_table_builder.h" +#include "util/file_reader_writer.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { +extern const uint64_t kCuckooTableMagicNumber; + +namespace { +std::unordered_map<std::string, std::vector<uint64_t>> hash_map; + +uint64_t GetSliceHash(const Slice& s, uint32_t index, + uint64_t /*max_num_buckets*/) { + return hash_map[s.ToString()][index]; +} +} // namespace + +class CuckooBuilderTest : public testing::Test { + public: + CuckooBuilderTest() { + env_ = Env::Default(); + Options options; + options.allow_mmap_reads = true; + env_options_ = EnvOptions(options); + } + + void CheckFileContents(const std::vector<std::string>& keys, + const std::vector<std::string>& values, + const std::vector<uint64_t>& expected_locations, + std::string expected_unused_bucket, uint64_t expected_table_size, + uint32_t expected_num_hash_func, bool expected_is_last_level, + uint32_t expected_cuckoo_block_size = 1) { + uint64_t num_deletions = 0; + for (const auto& key : keys) { + ParsedInternalKey parsed; + if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) { + num_deletions++; + } + } + // Read file + std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); + uint64_t read_file_size; + ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + + // @lint-ignore TXT2 T25377293 Grandfathered in + Options options; + options.allow_mmap_reads = true; + ImmutableCFOptions ioptions(options); + + // Assert Table Properties. + TableProperties* props = nullptr; + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, + kCuckooTableMagicNumber, ioptions, + &props, true /* compression_type_missing */)); + // Check unused bucket. 
+    std::string unused_key = props->user_collected_properties[
+        CuckooTablePropertyNames::kEmptyKey];
+    ASSERT_EQ(expected_unused_bucket.substr(0,
+        props->fixed_key_len), unused_key);
+
+    uint64_t value_len_found =
+        *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
+            CuckooTablePropertyNames::kValueLength].data());
+    ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
+    ASSERT_EQ(props->raw_value_size, values.size()*value_len_found);
+    const uint64_t table_size =
+        *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
+            CuckooTablePropertyNames::kHashTableSize].data());
+    ASSERT_EQ(expected_table_size, table_size);
+    const uint32_t num_hash_func_found =
+        *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+            CuckooTablePropertyNames::kNumHashFunc].data());
+    ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
+    const uint32_t cuckoo_block_size =
+        *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
+            CuckooTablePropertyNames::kCuckooBlockSize].data());
+    ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
+    const bool is_last_level_found =
+        *reinterpret_cast<const bool*>(props->user_collected_properties[
+            CuckooTablePropertyNames::kIsLastLevel].data());
+    ASSERT_EQ(expected_is_last_level, is_last_level_found);
+
+    ASSERT_EQ(props->num_entries, keys.size());
+    ASSERT_EQ(props->num_deletions, num_deletions);
+    ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
+    ASSERT_EQ(props->data_size, expected_unused_bucket.size() *
+        (expected_table_size + expected_cuckoo_block_size - 1));
+    ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len);
+    ASSERT_EQ(props->column_family_id, 0);
+    ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName);
+    delete props;
+
+    // Check contents of the bucket.
+    std::vector<bool> keys_found(keys.size(), false);
+    size_t bucket_size = expected_unused_bucket.size();
+    for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
+      Slice read_slice;
+      ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice,
+                                  nullptr));
+      size_t key_idx =
+          std::find(expected_locations.begin(), expected_locations.end(), i) -
+          expected_locations.begin();
+      if (key_idx == keys.size()) {
+        // i is not one of the expected locations. Empty bucket.
+        if (read_slice.data() == nullptr) {
+          ASSERT_EQ(0, expected_unused_bucket.size());
+        } else {
+          ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0);
+        }
+      } else {
+        keys_found[key_idx] = true;
+        ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0);
+      }
+    }
+    for (auto key_found : keys_found) {
+      // Check that all keys were found.
+      ASSERT_TRUE(key_found);
+    }
+  }
+
+  std::string GetInternalKey(Slice user_key, bool zero_seqno,
+                             ValueType type = kTypeValue) {
+    IterKey ikey;
+    ikey.SetInternalKey(user_key, zero_seqno ?
0 : 1000, type); + return ikey.GetInternalKey().ToString(); + } + + uint64_t NextPowOf2(uint64_t num) { + uint64_t n = 2; + while (n <= num) { + n *= 2; + } + return n; + } + + uint64_t GetExpectedTableSize(uint64_t num) { + return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio)); + } + + + Env* env_; + EnvOptions env_options_; + std::string fname; + const double kHashTableRatio = 0.9; +}; + +TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("EmptyFile"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, + BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + CheckFileContents({}, {}, {}, "", 2, 2, false); +} + +TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { + for (auto type : {kTypeValue, kTypeDeletion}) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values; + if (type == kTypeValue) { + values = {"v01", "v02", "v03", "v04"}; + } else { + values = {"", "", "", ""}; + } + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false, type)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("NoCollisionFullKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false); + } +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", 
"v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionFullKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 4, false); +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFile> writable_file; + uint32_t cuckoo_block_size = 2; + fname = test::PerThreadDBPath("WithCollisionFullKey2"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder( + file_writer.get(), kHashTableRatio, num_hash_fun, 100, + BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, + 0 /* column_family_id */, kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + 
ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { + // Have two hash functions. Insert elements with overlapping hashes. + // Finally insert an element with hash value somewhere in the middle + // so that it displaces all the elements after that. + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 2}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionPathFullKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, false); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {3, 4}}, + {user_keys[3], {4, 5}}, + {user_keys[4], {0, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0}; + std::vector<std::string> keys; + for (auto& user_key : user_keys) { + 
keys.push_back(GetInternalKey(user_key, false)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 2, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, false, 2); +} + +TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("NoCollisionUserKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = "key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, true); +} + +TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"}; + std::vector<std::string> values = {"v01", "v02", 
"v03", "v04"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionUserKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = "key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 4, true); +} + +TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 2}}, + }; + hash_map = std::move(hm); + + std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2}; + uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = 
"key00"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_table_size, 2, true); +} + +TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { + // Have two hash functions. Insert elements with overlapping hashes. + // Finally try inserting an element with hash value somewhere in the middle + // and it should fail because the no. of elements to displace is too high. + uint32_t num_hash_fun = 2; + std::vector<std::string> user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 1}}, + }; + hash_map = std::move(hm); + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(file_writer->Close()); +} + +TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map<std::string, std::vector<uint64_t>> hm = { + {"repeatedkey", {0, 1, 2, 3}}}; + hash_map = std::move(hm); + uint32_t num_hash_fun = 4; + std::string user_key = "repeatedkey"; + + std::unique_ptr<WritableFile> writable_file; + fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + + builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); + ASSERT_EQ(builder.NumEntries(), 1u); + ASSERT_OK(builder.status()); + builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2")); + ASSERT_EQ(builder.NumEntries(), 2u); + ASSERT_OK(builder.status()); + + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(file_writer->Close()); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo_table_factory.cc new file mode 100644 index 
00000000..74d18d51 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_factory.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/cuckoo_table_factory.h" + +#include "db/dbformat.h" +#include "table/cuckoo_table_builder.h" +#include "table/cuckoo_table_reader.h" + +namespace rocksdb { + +Status CuckooTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader( + table_reader_options.ioptions, std::move(file), file_size, + table_reader_options.internal_comparator.user_comparator(), nullptr)); + Status s = new_reader->status(); + if (s.ok()) { + *table = std::move(new_reader); + } + return s; +} + +TableBuilder* CuckooTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + // Ignore the skipFIlters flag. Does not apply to this file format + // + + // TODO: change builder to take the option struct + return new CuckooTableBuilder( + file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, + table_builder_options.internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.use_module_hash, + table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, + column_family_id, table_builder_options.column_family_name); +} + +std::string CuckooTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(2000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " max_search_depth: %u\n", + table_options_.max_search_depth); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", + table_options_.cuckoo_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); + ret.append(buffer); + return ret; +} + +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new CuckooTableFactory(table_options); +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo_table_factory.h new file mode 100644 index 00000000..eb3c5e51 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_factory.h @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
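As an aside, the factory above is normally enabled through the public CuckooTableOptions / NewCuckooTableFactory() API rather than constructed directly. A minimal, illustrative sketch (not part of this patch; the field values are only examples) of wiring it into Options:

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeCuckooOptions() {
  rocksdb::CuckooTableOptions cuckoo_opts;
  cuckoo_opts.hash_table_ratio = 0.9;   // same ratio reported by GetPrintableTableOptions()
  cuckoo_opts.max_search_depth = 100;   // bound on the cuckoo displacement path length
  cuckoo_opts.cuckoo_block_size = 5;    // consecutive buckets probed per hash function
  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewCuckooTableFactory(cuckoo_opts));
  options.allow_mmap_reads = true;      // CuckooTableReader below requires mmap reads
  return options;
}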
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include <string> +#include "rocksdb/table.h" +#include "util/murmurhash.h" +#include "rocksdb/options.h" + +namespace rocksdb { + +const uint32_t kCuckooMurmurSeedMultiplier = 816922183; +static inline uint64_t CuckooHash( + const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, + uint64_t table_size_, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { +#if !defined NDEBUG || defined OS_WIN + // This part is used only in unit tests but we have to keep it for Windows + // build as we run test in both debug and release modes under Windows. + if (get_slice_hash != nullptr) { + return get_slice_hash(user_key, hash_cnt, table_size_); + } +#else + (void)get_slice_hash; +#endif + + uint64_t value = 0; + if (hash_cnt == 0 && identity_as_first_hash) { + value = (*reinterpret_cast<const int64_t*>(user_key.data())); + } else { + value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()), + kCuckooMurmurSeedMultiplier * hash_cnt); + } + if (use_module_hash) { + return value % table_size_; + } else { + return value & (table_size_ - 1); + } +} + +// Cuckoo Table is designed for applications that require fast point lookups +// but not fast range scans. +// +// Some assumptions: +// - Key length and Value length are fixed. +// - Does not support Snapshot. +// - Does not support Merge operations. +// - Does not support prefix bloom filters. +class CuckooTableFactory : public TableFactory { + public: + explicit CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) {} + ~CuckooTableFactory() {} + + const char* Name() const override { return "CuckooTable"; } + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + // Sanitizes the specified DB Options. + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::OK(); + } + + std::string GetPrintableTableOptions() const override; + + void* GetOptions() override { return &table_options_; } + + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { + return Status::OK(); + } + + private: + CuckooTableOptions table_options_; +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo_table_reader.cc new file mode 100644 index 00000000..f4df2467 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_reader.cc @@ -0,0 +1,398 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
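To make the bucket selection in CuckooHash() above concrete, here is a small self-contained sketch (illustrative only; it does not use the rocksdb headers) of the two reduction modes: modulo when use_module_hash is set, and masking when the table has been sized as a power of two:

#include <cassert>
#include <cstdint>

// Stand-in for the final reduction step of CuckooHash() above.
uint64_t ReduceToBucket(uint64_t hash_value, uint64_t table_size,
                        bool use_module_hash) {
  if (use_module_hash) {
    // Table size can be arbitrary (roughly num_entries / hash_table_ratio).
    return hash_value % table_size;
  }
  // Otherwise the builder rounds the table size up to a power of two, so
  // masking with (table_size - 1) selects the same bucket as the modulo above.
  assert((table_size & (table_size - 1)) == 0);
  return hash_value & (table_size - 1);
}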
+ +#ifndef ROCKSDB_LITE +#include "table/cuckoo_table_reader.h" + +#include <algorithm> +#include <limits> +#include <string> +#include <utility> +#include <vector> +#include "rocksdb/iterator.h" +#include "rocksdb/table.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/cuckoo_table_factory.h" +#include "table/get_context.h" +#include "util/arena.h" +#include "util/coding.h" + +namespace rocksdb { +namespace { +const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); +} + +extern const uint64_t kCuckooTableMagicNumber; + +CuckooTableReader::CuckooTableReader( + const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + const Comparator* comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) + : file_(std::move(file)), + is_last_level_(false), + identity_as_first_hash_(false), + use_module_hash_(false), + num_hash_func_(0), + unused_key_(""), + key_length_(0), + user_key_length_(0), + value_length_(0), + bucket_length_(0), + cuckoo_block_size_(0), + cuckoo_block_bytes_minus_one_(0), + table_size_(0), + ucomp_(comparator), + get_slice_hash_(get_slice_hash) { + if (!ioptions.allow_mmap_reads) { + status_ = Status::InvalidArgument("File is not mmaped"); + } + TableProperties* props = nullptr; + status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, + ioptions, &props, true /* compression_type_missing */); + if (!status_.ok()) { + return; + } + table_props_.reset(props); + auto& user_props = props->user_collected_properties; + auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); + if (hash_funs == user_props.end()) { + status_ = Status::Corruption("Number of hash functions not found"); + return; + } + num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); + auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); + if (unused_key == user_props.end()) { + status_ = Status::Corruption("Empty bucket value not found"); + return; + } + unused_key_ = unused_key->second; + + key_length_ = static_cast<uint32_t>(props->fixed_key_len); + auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); + if (user_key_len == user_props.end()) { + status_ = Status::Corruption("User key length not found"); + return; + } + user_key_length_ = *reinterpret_cast<const uint32_t*>( + user_key_len->second.data()); + + auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); + if (value_length == user_props.end()) { + status_ = Status::Corruption("Value length not found"); + return; + } + value_length_ = *reinterpret_cast<const uint32_t*>( + value_length->second.data()); + bucket_length_ = key_length_ + value_length_; + + auto hash_table_size = user_props.find( + CuckooTablePropertyNames::kHashTableSize); + if (hash_table_size == user_props.end()) { + status_ = Status::Corruption("Hash table size not found"); + return; + } + table_size_ = *reinterpret_cast<const uint64_t*>( + hash_table_size->second.data()); + + auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); + if (is_last_level == user_props.end()) { + status_ = Status::Corruption("Is last level not found"); + return; + } + is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); + + auto identity_as_first_hash = user_props.find( + CuckooTablePropertyNames::kIdentityAsFirstHash); + if (identity_as_first_hash == 
user_props.end()) { + status_ = Status::Corruption("identity as first hash not found"); + return; + } + identity_as_first_hash_ = *reinterpret_cast<const bool*>( + identity_as_first_hash->second.data()); + + auto use_module_hash = user_props.find( + CuckooTablePropertyNames::kUseModuleHash); + if (use_module_hash == user_props.end()) { + status_ = Status::Corruption("hash type is not found"); + return; + } + use_module_hash_ = *reinterpret_cast<const bool*>( + use_module_hash->second.data()); + auto cuckoo_block_size = user_props.find( + CuckooTablePropertyNames::kCuckooBlockSize); + if (cuckoo_block_size == user_props.end()) { + status_ = Status::Corruption("Cuckoo block size not found"); + return; + } + cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>( + cuckoo_block_size->second.data()); + cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; + status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr); +} + +Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, + const Slice& key, GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { + assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); + Slice user_key = ExtractUserKey(key); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { + uint64_t offset = bucket_length_ * CuckooHash( + user_key, hash_cnt, use_module_hash_, table_size_, + identity_as_first_hash_, get_slice_hash_); + const char* bucket = &file_data_.data()[offset]; + for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; + ++block_idx, bucket += bucket_length_) { + if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), + Slice(bucket, user_key.size()))) { + return Status::OK(); + } + // Here, we compare only the user key part as we support only one entry + // per user key and we don't support snapshot. + if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { + Slice value(bucket + key_length_, value_length_); + if (is_last_level_) { + // Sequence number is not stored at the last level, so we will use + // kMaxSequenceNumber since it is unknown. This could cause some + // transactions to fail to lock a key due to the unknown sequence number. + // However, it is not expected for anyone to use a CuckooTable in a + // TransactionDB. + get_context->SaveValue(value, kMaxSequenceNumber); + } else { + Slice full_key(bucket, key_length_); + ParsedInternalKey found_ikey; + ParseInternalKey(full_key, &found_ikey); + bool dont_care __attribute__((__unused__)); + get_context->SaveValue(found_ikey, value, &dont_care); + } + // We don't support merge operations. So, we return here. + return Status::OK(); + } + } + } + return Status::OK(); +} + +void CuckooTableReader::Prepare(const Slice& key) { + // Prefetch the first Cuckoo Block. 
+ Slice user_key = ExtractUserKey(key); + uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) + + bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, + identity_as_first_hash_, nullptr); + uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; + for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { + PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); + } +} + +class CuckooTableIterator : public InternalIterator { + public: + explicit CuckooTableIterator(CuckooTableReader* reader); + ~CuckooTableIterator() override {} + bool Valid() const override; + void SeekToFirst() override; + void SeekToLast() override; + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void Next() override; + void Prev() override; + Slice key() const override; + Slice value() const override; + Status status() const override { return Status::OK(); } + void InitIfNeeded(); + + private: + struct BucketComparator { + BucketComparator(const Slice& file_data, const Comparator* ucomp, + uint32_t bucket_len, uint32_t user_key_len, + const Slice& target = Slice()) + : file_data_(file_data), + ucomp_(ucomp), + bucket_len_(bucket_len), + user_key_len_(user_key_len), + target_(target) {} + bool operator()(const uint32_t first, const uint32_t second) const { + const char* first_bucket = + (first == kInvalidIndex) ? target_.data() : + &file_data_.data()[first * bucket_len_]; + const char* second_bucket = + (second == kInvalidIndex) ? target_.data() : + &file_data_.data()[second * bucket_len_]; + return ucomp_->Compare(Slice(first_bucket, user_key_len_), + Slice(second_bucket, user_key_len_)) < 0; + } + private: + const Slice file_data_; + const Comparator* ucomp_; + const uint32_t bucket_len_; + const uint32_t user_key_len_; + const Slice target_; + }; + + const BucketComparator bucket_comparator_; + void PrepareKVAtCurrIdx(); + CuckooTableReader* reader_; + bool initialized_; + // Contains a map of keys to bucket_id sorted in key order. + std::vector<uint32_t> sorted_bucket_ids_; + // We assume that the number of items can be stored in uint32 (4 Billion). 
+ uint32_t curr_key_idx_; + Slice curr_value_; + IterKey curr_key_; + // No copying allowed + CuckooTableIterator(const CuckooTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) + : bucket_comparator_(reader->file_data_, reader->ucomp_, + reader->bucket_length_, reader->user_key_length_), + reader_(reader), + initialized_(false), + curr_key_idx_(kInvalidIndex) { + sorted_bucket_ids_.clear(); + curr_value_.clear(); + curr_key_.Clear(); +} + +void CuckooTableIterator::InitIfNeeded() { + if (initialized_) { + return; + } + sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries)); + uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; + assert(num_buckets < kInvalidIndex); + const char* bucket = reader_->file_data_.data(); + for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { + if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { + sorted_bucket_ids_.push_back(bucket_id); + } + bucket += reader_->bucket_length_; + } + assert(sorted_bucket_ids_.size() == + reader_->GetTableProperties()->num_entries); + std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), + bucket_comparator_); + curr_key_idx_ = kInvalidIndex; + initialized_ = true; +} + +void CuckooTableIterator::SeekToFirst() { + InitIfNeeded(); + curr_key_idx_ = 0; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::SeekToLast() { + InitIfNeeded(); + curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::Seek(const Slice& target) { + InitIfNeeded(); + const BucketComparator seek_comparator( + reader_->file_data_, reader_->ucomp_, + reader_->bucket_length_, reader_->user_key_length_, + ExtractUserKey(target)); + auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), + sorted_bucket_ids_.end(), + kInvalidIndex, + seek_comparator); + curr_key_idx_ = + static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { + // Not supported + assert(false); +} + +bool CuckooTableIterator::Valid() const { + return curr_key_idx_ < sorted_bucket_ids_.size(); +} + +void CuckooTableIterator::PrepareKVAtCurrIdx() { + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + uint32_t id = sorted_bucket_ids_[curr_key_idx_]; + const char* offset = reader_->file_data_.data() + + id * reader_->bucket_length_; + if (reader_->is_last_level_) { + // Always return internal key. 
+ curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), + 0, kTypeValue); + } else { + curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); + } + curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); +} + +void CuckooTableIterator::Next() { + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + ++curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +void CuckooTableIterator::Prev() { + if (curr_key_idx_ == 0) { + curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); + } + if (!Valid()) { + curr_value_.clear(); + curr_key_.Clear(); + return; + } + --curr_key_idx_; + PrepareKVAtCurrIdx(); +} + +Slice CuckooTableIterator::key() const { + assert(Valid()); + return curr_key_.GetInternalKey(); +} + +Slice CuckooTableIterator::value() const { + assert(Valid()); + return curr_value_; +} + +InternalIterator* CuckooTableReader::NewIterator( + const ReadOptions& /*read_options*/, + const SliceTransform* /* prefix_extractor */, Arena* arena, + bool /*skip_filters*/, bool /*for_compaction*/) { + if (!status().ok()) { + return NewErrorInternalIterator<Slice>( + Status::Corruption("CuckooTableReader status is not okay."), arena); + } + CuckooTableIterator* iter; + if (arena == nullptr) { + iter = new CuckooTableIterator(this); + } else { + auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); + iter = new (iter_mem) CuckooTableIterator(this); + } + return iter; +} + +size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } + +} // namespace rocksdb +#endif diff --git a/src/rocksdb/table/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo_table_reader.h new file mode 100644 index 00000000..b37d4637 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_reader.h @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
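The reader above treats the mmapped file as a flat array of fixed-length buckets, with the key and value lengths taken from the table properties read in the constructor. A hypothetical helper (illustrative only, mirroring the arithmetic in Get() and PrepareKVAtCurrIdx()) that decodes one bucket:

#include <cstdint>
#include <string>
#include <utility>

// bucket_length = key_length + value_length, as in the reader above.
// In last-level files the stored key is just the user key.
std::pair<std::string, std::string> DecodeBucket(const char* file_data,
                                                 uint64_t bucket_id,
                                                 uint32_t key_length,
                                                 uint32_t value_length) {
  const char* bucket = file_data + bucket_id * (key_length + value_length);
  return {std::string(bucket, key_length),                  // stored key
          std::string(bucket + key_length, value_length)};  // stored value
}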
+ +#pragma once +#ifndef ROCKSDB_LITE +#include <string> +#include <memory> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "table/table_reader.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +class Arena; +class TableReader; + +class CuckooTableReader: public TableReader { + public: + CuckooTableReader(const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, const Comparator* user_comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); + ~CuckooTableReader() {} + + std::shared_ptr<const TableProperties> GetTableProperties() const override { + return table_props_; + } + + Status status() const { return status_; } + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; + void Prepare(const Slice& target) override; + + // Report an approximation of how much memory has been used. + size_t ApproximateMemoryUsage() const override; + + // Following methods are not implemented for Cuckoo Table Reader + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + void SetupForCompaction() override {} + // End of methods not implemented. + + private: + friend class CuckooTableIterator; + void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id); + std::unique_ptr<RandomAccessFileReader> file_; + Slice file_data_; + bool is_last_level_; + bool identity_as_first_hash_; + bool use_module_hash_; + std::shared_ptr<const TableProperties> table_props_; + Status status_; + uint32_t num_hash_func_; + std::string unused_key_; + uint32_t key_length_; + uint32_t user_key_length_; + uint32_t value_length_; + uint32_t bucket_length_; + uint32_t cuckoo_block_size_; + uint32_t cuckoo_block_bytes_minus_one_; + uint64_t table_size_; + const Comparator* ucomp_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, + uint64_t max_num_buckets); +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo_table_reader_test.cc new file mode 100644 index 00000000..74fb52e6 --- /dev/null +++ b/src/rocksdb/table/cuckoo_table_reader_test.cc @@ -0,0 +1,573 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#ifndef GFLAGS +#include <cstdio> +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#include <vector> +#include <string> +#include <map> + +#include "table/cuckoo_table_builder.h" +#include "table/cuckoo_table_factory.h" +#include "table/cuckoo_table_reader.h" +#include "table/get_context.h" +#include "table/meta_blocks.h" +#include "util/arena.h" +#include "util/gflags_compat.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +DEFINE_string(file_dir, "", "Directory where the files will be created" + " for benchmark. Added for using tmpfs."); +DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); +DEFINE_bool(write, false, + "Should write new values to file in performance tests?"); +DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); + +namespace rocksdb { + +namespace { +const uint32_t kNumHashFunc = 10; +// Methods, variables related to Hash functions. +std::unordered_map<std::string, std::vector<uint64_t>> hash_map; + +void AddHashLookups(const std::string& s, uint64_t bucket_id, + uint32_t num_hash_fun) { + std::vector<uint64_t> v; + for (uint32_t i = 0; i < num_hash_fun; i++) { + v.push_back(bucket_id + i); + } + hash_map[s] = v; +} + +uint64_t GetSliceHash(const Slice& s, uint32_t index, + uint64_t /*max_num_buckets*/) { + return hash_map[s.ToString()][index]; +} +} // namespace + +class CuckooReaderTest : public testing::Test { + public: + using testing::Test::SetUp; + + CuckooReaderTest() { + options.allow_mmap_reads = true; + env = options.env; + env_options = EnvOptions(options); + } + + void SetUp(int num) { + num_items = num; + hash_map.clear(); + keys.clear(); + keys.resize(num_items); + user_keys.clear(); + user_keys.resize(num_items); + values.clear(); + values.resize(num_items); + } + + std::string NumToStr(int64_t i) { + return std::string(reinterpret_cast<char*>(&i), sizeof(i)); + } + + void CreateCuckooFileAndCheckReader( + const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, env_options)); + + CuckooTableBuilder builder( + file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, + GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { + builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); + ASSERT_OK(builder.status()); + ASSERT_EQ(builder.NumEntries(), key_idx + 1); + } + ASSERT_OK(builder.Finish()); + ASSERT_EQ(num_items, builder.NumEntries()); + file_size = builder.FileSize(); + ASSERT_OK(file_writer->Close()); + + // Check reader now. 
+ std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); + ASSERT_OK(reader.status()); + // Assume no merge/deletion + for (uint32_t i = 0; i < num_items; ++i) { + PinnableSlice value; + GetContext get_context(ucomp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(user_keys[i]), &value, + nullptr, nullptr, nullptr, nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); + ASSERT_STREQ(values[i].c_str(), value.data()); + } + } + void UpdateKeys(bool with_zero_seqno) { + for (uint32_t i = 0; i < num_items; i++) { + ParsedInternalKey ikey(user_keys[i], + with_zero_seqno ? 0 : i + 1000, kTypeValue); + keys[i].clear(); + AppendInternalKey(&keys[i], ikey); + } + } + + void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { + std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); + ASSERT_OK(reader.status()); + InternalIterator* it = + reader.NewIterator(ReadOptions(), nullptr, nullptr, false); + ASSERT_OK(it->status()); + ASSERT_TRUE(!it->Valid()); + it->SeekToFirst(); + int cnt = 0; + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + ++cnt; + it->Next(); + } + ASSERT_EQ(static_cast<uint32_t>(cnt), num_items); + + it->SeekToLast(); + cnt = static_cast<int>(num_items) - 1; + ASSERT_TRUE(it->Valid()); + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + --cnt; + it->Prev(); + } + ASSERT_EQ(cnt, -1); + + cnt = static_cast<int>(num_items) / 2; + it->Seek(keys[cnt]); + while (it->Valid()) { + ASSERT_OK(it->status()); + ASSERT_TRUE(Slice(keys[cnt]) == it->key()); + ASSERT_TRUE(Slice(values[cnt]) == it->value()); + ++cnt; + it->Next(); + } + ASSERT_EQ(static_cast<uint32_t>(cnt), num_items); + delete it; + + Arena arena; + it = reader.NewIterator(ReadOptions(), nullptr, &arena); + ASSERT_OK(it->status()); + ASSERT_TRUE(!it->Valid()); + it->Seek(keys[num_items/2]); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_TRUE(keys[num_items/2] == it->key()); + ASSERT_TRUE(values[num_items/2] == it->value()); + ASSERT_OK(it->status()); + it->~InternalIterator(); + } + + std::vector<std::string> keys; + std::vector<std::string> user_keys; + std::vector<std::string> values; + uint64_t num_items; + std::string fname; + uint64_t file_size; + Options options; + Env* env; + EnvOptions env_options; +}; + +TEST_F(CuckooReaderTest, WhenKeyExists) { + SetUp(kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i] = "key" + NumToStr(i); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values. 
+ AddHashLookups(user_keys[i], i, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); + // Test with collision. Make all hash values collide. + hash_map.clear(); + for (uint32_t i = 0; i < num_items; i++) { + AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + UpdateKeys(false); + CreateCuckooFileAndCheckReader(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); +} + +TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { + SetUp(kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], static_cast<void*>(&i), 8); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values. + AddHashLookups(user_keys[i], i, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Test with collision. Make all hash values collide. + hash_map.clear(); + for (uint32_t i = 0; i < num_items; i++) { + AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + UpdateKeys(false); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); +} + +TEST_F(CuckooReaderTest, CheckIterator) { + SetUp(2*kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i] = "key" + NumToStr(i); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values, in reverse order. + AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(); + CheckIterator(); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); + CheckIterator(); +} + +TEST_F(CuckooReaderTest, CheckIteratorUint64) { + SetUp(2*kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i].resize(8); + memcpy(&user_keys[i][0], static_cast<void*>(&i), 8); + ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Give disjoint hash values, in reverse order. + AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc); + } + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); + // Last level file. + UpdateKeys(true); + CreateCuckooFileAndCheckReader(test::Uint64Comparator()); + CheckIterator(test::Uint64Comparator()); +} + +TEST_F(CuckooReaderTest, WhenKeyNotFound) { + // Add keys with colliding hash values. + SetUp(kNumHashFunc); + fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound"); + for (uint64_t i = 0; i < num_items; i++) { + user_keys[i] = "key" + NumToStr(i); + ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); + AppendInternalKey(&keys[i], ikey); + values[i] = "value" + NumToStr(i); + // Make all hash values collide. 
+ AddHashLookups(user_keys[i], 0, kNumHashFunc); + } + auto* ucmp = BytewiseComparator(); + CreateCuckooFileAndCheckReader(); + std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, + GetSliceHash); + ASSERT_OK(reader.status()); + // Search for a key with colliding hash values. + std::string not_found_user_key = "key" + NumToStr(num_items); + std::string not_found_key; + AddHashLookups(not_found_user_key, 0, kNumHashFunc); + ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue); + AppendInternalKey(¬_found_key, ikey); + PinnableSlice value; + GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, + Slice(not_found_key), &value, nullptr, nullptr, + nullptr, nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); + ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.status()); + // Search for a key with an independent hash value. + std::string not_found_user_key2 = "key" + NumToStr(num_items + 1); + AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc); + ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue); + std::string not_found_key2; + AppendInternalKey(¬_found_key2, ikey2); + value.Reset(); + GetContext get_context2(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(not_found_key2), &value, + nullptr, nullptr, nullptr, nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); + ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.status()); + + // Test read when key is unused key. + std::string unused_key = + reader.GetTableProperties()->user_collected_properties.at( + CuckooTablePropertyNames::kEmptyKey); + // Add hash values that map to empty buckets. + AddHashLookups(ExtractUserKey(unused_key).ToString(), + kNumHashFunc, kNumHashFunc); + value.Reset(); + GetContext get_context3(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(unused_key), &value, + nullptr, nullptr, nullptr, nullptr); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); + ASSERT_TRUE(value.empty()); + ASSERT_OK(reader.status()); +} + +// Performance tests +namespace { +void GetKeys(uint64_t num, std::vector<std::string>* keys) { + keys->clear(); + IterKey k; + k.SetInternalKey("", 0, kTypeValue); + std::string internal_key_suffix = k.GetInternalKey().ToString(); + ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size()); + for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { + uint64_t value = 2 * key_idx; + std::string new_key(reinterpret_cast<char*>(&value), sizeof(value)); + new_key += internal_key_suffix; + keys->push_back(new_key); + } +} + +std::string GetFileName(uint64_t num) { + if (FLAGS_file_dir.empty()) { + FLAGS_file_dir = test::TmpDir(); + } + return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") + + ToString(num / 1000000) + "Mkeys"; +} + +// Create last level file as we are interested in measuring performance of +// last level file only. 
+void WriteFile(const std::vector<std::string>& keys, + const uint64_t num, double hash_ratio) { + Options options; + options.allow_mmap_reads = true; + Env* env = options.env; + EnvOptions env_options = EnvOptions(options); + std::string fname = GetFileName(num); + + std::unique_ptr<WritableFile> writable_file; + ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(writable_file), fname, env_options)); + CuckooTableBuilder builder( + file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, + false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, + kDefaultColumnFamilyName); + ASSERT_OK(builder.status()); + for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { + // Value is just a part of key. + builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4)); + ASSERT_EQ(builder.NumEntries(), key_idx + 1); + ASSERT_OK(builder.status()); + } + ASSERT_OK(builder.Finish()); + ASSERT_EQ(num, builder.NumEntries()); + ASSERT_OK(file_writer->Close()); + + uint64_t file_size; + env->GetFileSize(fname, &file_size); + std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); + ASSERT_OK(reader.status()); + ReadOptions r_options; + PinnableSlice value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, nullptr, + nullptr, nullptr, nullptr); + for (uint64_t i = 0; i < num; ++i) { + value.Reset(); + value.clear(); + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr)); + ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); + } +} + +void ReadKeys(uint64_t num, uint32_t batch_size) { + Options options; + options.allow_mmap_reads = true; + Env* env = options.env; + EnvOptions env_options = EnvOptions(options); + std::string fname = GetFileName(num); + + uint64_t file_size; + env->GetFileSize(fname, &file_size); + std::unique_ptr<RandomAccessFile> read_file; + ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(read_file), fname)); + + const ImmutableCFOptions ioptions(options); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); + ASSERT_OK(reader.status()); + const UserCollectedProperties user_props = + reader.GetTableProperties()->user_collected_properties; + const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>( + user_props.at(CuckooTablePropertyNames::kNumHashFunc).data()); + const uint64_t table_size = *reinterpret_cast<const uint64_t*>( + user_props.at(CuckooTablePropertyNames::kHashTableSize).data()); + fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of" + " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun); + ReadOptions r_options; + + std::vector<uint64_t> keys; + keys.reserve(num); + for (uint64_t i = 0; i < num; ++i) { + keys.push_back(2 * i); + } + std::random_shuffle(keys.begin(), keys.end()); + + PinnableSlice value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, 
nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, nullptr, + nullptr, nullptr, nullptr); + uint64_t start_time = env->NowMicros(); + if (batch_size > 0) { + for (uint64_t i = 0; i < num; i += batch_size) { + for (uint64_t j = i; j < i+batch_size && j < num; ++j) { + reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16)); + } + for (uint64_t j = i; j < i+batch_size && j < num; ++j) { + reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16), + &get_context, nullptr); + } + } + } else { + for (uint64_t i = 0; i < num; i++) { + reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16), + &get_context, nullptr); + } + } + float time_per_op = (env->NowMicros() - start_time) * 1.0f / num; + fprintf(stderr, + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", + time_per_op, 1.0 / time_per_op, batch_size); +} +} // namespace. + +TEST_F(CuckooReaderTest, TestReadPerformance) { + if (!FLAGS_enable_perf) { + return; + } + double hash_ratio = 0.95; + // These numbers are chosen to have a hash utilization % close to + // 0.9, 0.75, 0.6 and 0.5 respectively. + // They all create 128 M buckets. + std::vector<uint64_t> nums = {120*1024*1024, 100*1024*1024, 80*1024*1024, + 70*1024*1024}; +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); +#endif + for (uint64_t num : nums) { + if (FLAGS_write || + Env::Default()->FileExists(GetFileName(num)).IsNotFound()) { + std::vector<std::string> all_keys; + GetKeys(num, &all_keys); + WriteFile(all_keys, num, hash_ratio); + } + ReadKeys(num, 0); + ReadKeys(num, 10); + ReadKeys(num, 25); + ReadKeys(num, 50); + ReadKeys(num, 100); + fprintf(stderr, "\n"); + } +} +} // namespace rocksdb + +int main(int argc, char** argv) { + if (rocksdb::port::kLittleEndian) { + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); + } + else { + fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n"); + return 0; + } +} + +#endif // GFLAGS. + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/data_block_footer.cc b/src/rocksdb/table/data_block_footer.cc new file mode 100644 index 00000000..cb9e1438 --- /dev/null +++ b/src/rocksdb/table/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "data_block_footer.h" + +#include "rocksdb/table.h" + +namespace rocksdb { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/data_block_footer.h b/src/rocksdb/table/data_block_footer.h new file mode 100644 index 00000000..e6ff20bc --- /dev/null +++ b/src/rocksdb/table/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/table.h" + +namespace rocksdb { + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts); + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts); + +} // namespace rocksdb diff --git a/src/rocksdb/table/data_block_hash_index.cc b/src/rocksdb/table/data_block_hash_index.cc new file mode 100644 index 00000000..adb1d7b8 --- /dev/null +++ b/src/rocksdb/table/data_block_hash_index.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#include <string> +#include <vector> + +#include "rocksdb/slice.h" +#include "table/data_block_hash_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +void DataBlockHashIndexBuilder::Add(const Slice& key, + const size_t restart_index) { + assert(Valid()); + if (restart_index > kMaxRestartSupportedByHashIndex) { + valid_ = false; + return; + } + + uint32_t hash_value = GetSliceHash(key); + hash_and_restart_pairs_.emplace_back(hash_value, + static_cast<uint8_t>(restart_index)); + estimated_num_buckets_ += bucket_per_key_; +} + +void DataBlockHashIndexBuilder::Finish(std::string& buffer) { + assert(Valid()); + uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_); + + if (num_buckets == 0) { + num_buckets = 1; // sanity check + } + + // The build-in hash cannot well distribute strings when into different + // buckets when num_buckets is power of two, resulting in high hash + // collision. + // We made the num_buckets to be odd to avoid this issue. + num_buckets |= 1; + + std::vector<uint8_t> buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast<const char*>(reinterpret_cast<char*>(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/data_block_hash_index.h b/src/rocksdb/table/data_block_hash_index.h new file mode 100644 index 00000000..0af8b257 --- /dev/null +++ b/src/rocksdb/table/data_block_hash_index.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +namespace rocksdb { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. 
+//
+// It is only used to support BlockBasedTable::Get().
+//
+// A serialized hash index is appended to the data-block. The new block data
+// format is as follows:
+//
+// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
+//
+// RI: Restart Interval (the same as the default data-block format)
+// RI_IDX: Restart Interval index (the same as the default data-block format)
+// HASH_IDX: The new data-block hash index feature.
+// FOOTER: A 32-bit block footer, which is the NUM_RESTARTS with the MSB as
+//         the flag indicating if this hash index is in use. Note that
+//         given a data block < 32KB, the MSB is never used. So we can
+//         borrow the MSB as the hash index flag. Therefore, this format is
+//         compatible with the legacy data-blocks with num_restarts < 32768,
+//         as the MSB is 0.
+//
+// The format of the data-block hash index is as follows:
+//
+// HASH_IDX: [B B B ... B NUM_BUCK]
+//
+// B: bucket, an array of restart indexes. Each bucket is a uint8_t.
+// NUM_BUCK: Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flags:
+//    kNoEntry=255,
+//    kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can support is 253.
+//
+// Buckets are initialized to kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in the
+// bucket. If there is already a restart index there, we update the existing
+// restart index to a collision marker (kCollision). If the bucket is already
+// marked as a collision, we do not store the restart index either.
+//
+// During a query, the key is first hashed to a bucket. If the bucket stores
+// kNoEntry, the key does not exist in this block. If the bucket stores
+// kCollision, we fall back to the normal search over the restart intervals.
+// Otherwise the bucket holds the restart index of the key, and we go directly
+// to that restart interval to search for the key.
+//
+// Note that we only support blocks with #restart_interval < 254. If a block
+// has more restart intervals than that, the hash index will not be created
+// for it.
+
+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t addresses, we only support blocks of no more than
+// 64KB.
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
+
+class DataBlockHashIndexBuilder {
+ public:
+  DataBlockHashIndexBuilder()
+      : bucket_per_key_(-1 /*uninitialized marker*/),
+        estimated_num_buckets_(0),
+        valid_(false) {}
+
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check
+    }
+    bucket_per_key_ = 1 / util_ratio;
+    valid_ = true;
+  }
+
+  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
+  void Finish(std::string& buffer);
+  void Reset();
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets =
+        static_cast<uint16_t>(estimated_num_buckets_);
+
+    // Matching the num_buckets computation in DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }
+
+ private:
+  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
+  double estimated_num_buckets_;
+
+  // Now the only usage for `valid_` is to mark false when the inserted
+  // restart_index is larger than supported. In this case HashIndex is not
+  // appended to the block content.
+  bool valid_;
+
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+  DataBlockHashIndex() : num_buckets_(0) {}
+
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
+
+  inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+  // To make the serialized hash index compact and to save the space overhead,
+  // here all the data fields persisted in the block are in uint16 format.
+  // We find that a uint16 is large enough to index every offset of a 64KiB
+  // block.
+  // So in other words, DataBlockHashIndex does not support block sizes equal
+  // to or greater than 64KiB.
+  uint16_t num_buckets_;
+};
+
+} // namespace rocksdb
diff --git a/src/rocksdb/table/data_block_hash_index_test.cc b/src/rocksdb/table/data_block_hash_index_test.cc
new file mode 100644
index 00000000..11226648
--- /dev/null
+++ b/src/rocksdb/table/data_block_hash_index_test.cc
@@ -0,0 +1,724 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block.h"
+#include "table/block_based_table_reader.h"
+#include "table/block_builder.h"
+#include "table/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  uint8_t entry = index.Lookup(data, map_offset, key);
+  if (entry == kCollision) {
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    return false;
+  }
+
+  return entry == restart_point;
+}
+
+// Random KV generator similar to block_test
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
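Editorial note, not part of the upstream diff: GenerateKey() above formats the two integers with "%6d%4d", so for example primary_key = 3, secondary_key = 7 and no padding yields the 10-character key "     3   7". Because both fields are right-justified into fixed-width, space-padded columns, keys produced for the increasing primary_key values used by these tests compare in ascending byte order, which is why the generated keys come out sorted.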
+void GenerateRandomKVs(std::vector<std::string>* keys, + std::vector<std::string>* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid + for (int i = 0; i <= 253; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + // #restarts > 253. HashIndex is not used + for (int i = 0; i <= 254; i++) { + std::string ukey = "key" + std::to_string(i); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), "value"); + } + + { + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockSizeExceedMax) { + Options options = Options(); + std::string ukey(10, 'k'); + InternalKey ikey(ukey, 0, kTypeValue); + + BlockBuilder builder(1 /* block_restart_interval */, + false /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + { + // insert a large value. The block size plus HashIndex is 65536. + std::string value(65502, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinaryAndHash); + } + + builder.Reset(); + + { + // insert a large value. The block size plus HashIndex would be 65537. + // This excceed the max block size supported by HashIndex (65536). + // So when build finishes HashIndex will not be created for the block. + std::string value(65503, 'v'); + + builder.Add(ikey.Encode().ToString(), value); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex); + std::cerr << "block size: " << rawblock.size() << std::endl; + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + // the index type have fallen back to binary when build finish. 
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator()); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector<std::string> keys; + std::vector<std::string> values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
+ for (int i = 0; i < num_records; i++) { + std::string ukey(keys[i] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + builder.Add(ikey.Encode().ToString(), values[i]); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + const InternalKeyComparator icmp(BytewiseComparator()); + + // random seek existent keys + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "1" /* existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(values[index], iter->value()); + + delete iter; + } + + // random seek non-existent user keys + // In this case A), the user_key cannot be found in HashIndex. The key may + // exist in the next block. So the iter is set invalidated to tell the + // caller to search the next block. This test case belongs to this case A). + // + // Note that for non-existent keys, there is possibility of false positive, + // i.e. the key is still hashed into some restart interval. + // Two additional possible outcome: + // B) linear seek the restart interval and not found, the iter stops at the + // starting of the next restart interval. The key does not exist + // anywhere. + // C) linear seek the restart interval and not found, the iter stops at the + // the end of the block, i.e. restarts_. The key may exist in the next + // block. 
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = + reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<RandomAccessFileReader> file_reader; + std::unique_ptr<TableReader> table_reader; + int level_ = -1; + + std::vector<std::string> keys; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer.reset( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr<TableBuilder> builder; + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, options.sample_for_compression, + CompressionOptions(), false /* skip_filters */, + column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + file_writer->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(static_cast<test::StringSink*>(file_writer->writable_file()) + ->contents() + .size(), + builder->FileSize()); + + // Open the table + file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( + static_cast<test::StringSink*>(file_writer->writable_file())->contents(), + 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), + static_cast<test::StringSink*>(file_writer->writable_file()) + ->contents() + .size(), + &table_reader); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + 
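Editorial note, not part of the upstream diff: the block-boundary cases below depend on internal-key ordering, which sorts by ascending user key and, for equal user keys, by descending sequence number. For the pairs used here that gives "aab"@100 < "axy"@100 < "axy"@60 < "axy"@10, so a seek for "axy"@60 resolves to the "axy"@10 entry (block N+1), a seek for "axy"@120 resolves to "axy"@100 (block N), and a seek for "axy"@5 finds no matching entry.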
// insert two large k/v pair. Given that the block_size is 4096, one k/v + // pair will take up one block. + // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/filter_block.h b/src/rocksdb/table/filter_block.h new file mode 100644 index 00000000..a9304954 --- /dev/null +++ b/src/rocksdb/table/filter_block.h @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file. It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+//
+// It is a base class for BlockBasedFilter and FullFilter.
+// These two are both used in BlockBasedTable. The first one contains a filter
+// for a part of the keys in an SST file, the second contains a filter for all
+// keys in the SST file.
+
+#pragma once
+
+#include <memory>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+#include "format.h"
+
+namespace rocksdb {
+
+const uint64_t kNotValid = ULLONG_MAX;
+class FilterPolicy;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+// (StartBlock Add*)* Finish
+//
+// BlockBased/Full FilterBlock would be called in the same way.
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder() {}
+  virtual ~FilterBlockBuilder() {}
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+  virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
+  virtual void Add(const Slice& key) = 0;      // Add a key to current filter
+  virtual size_t NumAdded() const = 0;         // Number of keys added
+  Slice Finish() {                             // Generate Filter
+    const BlockHandle empty_handle;
+    Status dont_care_status;
+    auto ret = Finish(empty_handle, &dont_care_status);
+    assert(dont_care_status.ok());
+    return ret;
+  }
+  virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+
+ private:
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&);
+  void operator=(const FilterBlockBuilder&);
+};
+
+// A FilterBlockReader is used to parse a filter from an SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking.
+//
+// BlockBased/Full FilterBlock would be called in the same way.
+class FilterBlockReader {
+ public:
+  explicit FilterBlockReader()
+      : whole_key_filtering_(true), size_(0), statistics_(nullptr) {}
+  explicit FilterBlockReader(size_t s, Statistics* stats,
+                             bool _whole_key_filtering)
+      : whole_key_filtering_(_whole_key_filtering),
+        size_(s),
+        statistics_(stats) {}
+  virtual ~FilterBlockReader() {}
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+  /**
+   * If no_io is set, then it returns true if it cannot answer the query without
+   * reading data from disk. This is used in PartitionedFilterBlockReader to
+   * avoid reading partitions that are not in block cache already.
+   *
+   * Normally filters are built on only the user keys and the InternalKey is not
+   * needed for a query. The index in PartitionedFilterBlockReader however is
+   * built upon InternalKey and must be provided via const_ikey_ptr when running
+   * queries.
+ */ + virtual bool KeyMayMatch(const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, + const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) = 0; + + /** + * no_io and const_ikey_ptr here means the same as in KeyMayMatch + */ + virtual bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, + const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) = 0; + + virtual size_t ApproximateMemoryUsage() const = 0; + virtual size_t size() const { return size_; } + virtual Statistics* statistics() const { return statistics_; } + + bool whole_key_filtering() const { return whole_key_filtering_; } + + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + + virtual void CacheDependencies(bool /*pin*/, + const SliceTransform* /*prefix_extractor*/) {} + + virtual bool RangeMayExist( + const Slice* /*iterate_upper_bound*/, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, + bool* filter_checked, bool /*need_upper_bound_check*/) { + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } + + protected: + bool whole_key_filtering_; + + private: + // No copying allowed + FilterBlockReader(const FilterBlockReader&); + void operator=(const FilterBlockReader&); + size_t size_; + Statistics* statistics_; + int level_ = -1; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/flush_block_policy.cc b/src/rocksdb/table/flush_block_policy.cc new file mode 100644 index 00000000..1b167582 --- /dev/null +++ b/src/rocksdb/table/flush_block_policy.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "table/block_builder.h" +#include "table/format.h" + +#include <cassert> + +namespace rocksdb { + +// Flush block by size +class FlushBlockBySizePolicy : public FlushBlockPolicy { + public: + // @params block_size: Approximate size of user data packed per + // block. + // @params block_size_deviation: This is used to close a block before it + // reaches the configured + FlushBlockBySizePolicy(const uint64_t block_size, + const uint64_t block_size_deviation, + const bool align, + const BlockBuilder& data_block_builder) + : block_size_(block_size), + block_size_deviation_limit_( + ((block_size * (100 - block_size_deviation)) + 99) / 100), + align_(align), + data_block_builder_(data_block_builder) {} + + bool Update(const Slice& key, const Slice& value) override { + // it makes no sense to flush when the data block is empty + if (data_block_builder_.empty()) { + return false; + } + + auto curr_size = data_block_builder_.CurrentSizeEstimate(); + + // Do flush if one of the below two conditions is true: + // 1) if the current estimated size already exceeds the block size, + // 2) block_size_deviation is set and the estimated size after appending + // the kv will exceed the block size and the current size is under the + // the deviation. 
+ return curr_size >= block_size_ || BlockAlmostFull(key, value); + } + + private: + bool BlockAlmostFull(const Slice& key, const Slice& value) const { + if (block_size_deviation_limit_ == 0) { + return false; + } + + const auto curr_size = data_block_builder_.CurrentSizeEstimate(); + auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + if (align_) { + estimated_size_after += kBlockTrailerSize; + return estimated_size_after > block_size_; + } + + return estimated_size_after > block_size_ && + curr_size > block_size_deviation_limit_; + } + + const uint64_t block_size_; + const uint64_t block_size_deviation_limit_; + const bool align_; + const BlockBuilder& data_block_builder_; +}; + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const BlockBasedTableOptions& table_options, + const BlockBuilder& data_block_builder) const { + return new FlushBlockBySizePolicy( + table_options.block_size, table_options.block_size_deviation, + table_options.block_align, data_block_builder); +} + +FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + const uint64_t size, const int deviation, + const BlockBuilder& data_block_builder) { + return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc new file mode 100644 index 00000000..476db85f --- /dev/null +++ b/src/rocksdb/table/format.cc @@ -0,0 +1,412 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
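Editorial aside, not part of the upstream diff: a small sketch of the flush decision implemented by FlushBlockBySizePolicy above, assuming block_size = 4096 bytes, block_size_deviation = 10 percent, and block_align disabled (illustrative values only).

#include <cstdint>

// Mirrors FlushBlockBySizePolicy::Update() / BlockAlmostFull() for the
// assumed options above.
bool ShouldFlushSketch(uint64_t curr_size, uint64_t estimated_size_after_kv) {
  const uint64_t block_size = 4096;
  // Same rounding as the constructor: ((4096 * (100 - 10)) + 99) / 100 == 3687.
  const uint64_t deviation_limit = ((block_size * (100 - 10)) + 99) / 100;
  if (curr_size == 0) {
    return false;  // never flush an empty data block
  }
  if (curr_size >= block_size) {
    return true;   // the block already reached the configured size
  }
  // "Almost full": appending the next key/value would overflow the target
  // size and the block has already passed the deviation threshold.
  return estimated_size_after_kv > block_size && curr_size > deviation_limit;
}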
+ +#include "table/format.h" + +#include <inttypes.h> +#include <string> + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/block_fetcher.h" +#include "table/persistent_cache_helper.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/logging.h" +#include "util/memory_allocator.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; + +#ifndef ROCKSDB_LITE +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +#else +// ROCKSDB_LITE doesn't have plain table +const uint64_t kLegacyPlainTableMagicNumber = 0; +const uint64_t kPlainTableMagicNumber = 0; +#endif + +bool ShouldReportDetailedTime(Env* env, Statistics* stats) { + return env != nullptr && stats != nullptr && + stats->get_stats_level() > kExceptDetailedTimers; +} + +void BlockHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~static_cast<uint64_t>(0)); + assert(size_ != ~static_cast<uint64_t>(0)); + PutVarint64Varint64(dst, offset_, size_); +} + +Status BlockHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +// Return a string that contains the copy of handle. 
+std::string BlockHandle::ToString(bool hex) const { + std::string handle_str; + EncodeTo(&handle_str); + if (hex) { + return Slice(handle_str).ToString(true); + } else { + return handle_str; + } +} + +const BlockHandle BlockHandle::kNullBlockHandle(0, 0); + +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return 0; +} +} // namespace + +// legacy footer format: +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength +// table_magic_number (8 bytes) +// new footer format: +// checksum type (char, 1 byte) +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 +// footer version (4 bytes) +// table_magic_number (8 bytes) +void Footer::EncodeTo(std::string* dst) const { + assert(HasInitializedTableMagicNumber()); + if (IsLegacyFooterFormat(table_magic_number())) { + // has to be default checksum with legacy footer + assert(checksum_ == kCRC32c); + const size_t original_size = dst->size(); + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); + assert(dst->size() == original_size + kVersion0EncodedLength); + } else { + const size_t original_size = dst->size(); + dst->push_back(static_cast<char>(checksum_)); + metaindex_handle_.EncodeTo(dst); + index_handle_.EncodeTo(dst); + dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding + PutFixed32(dst, version()); + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); + assert(dst->size() == original_size + kNewVersionsEncodedLength); + } +} + +Footer::Footer(uint64_t _table_magic_number, uint32_t _version) + : version_(_version), + checksum_(kCRC32c), + table_magic_number_(_table_magic_number) { + // This should be guaranteed by constructor callers + assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); +} + +Status Footer::DecodeFrom(Slice* input) { + assert(!HasInitializedTableMagicNumber()); + assert(input != nullptr); + assert(input->size() >= kMinEncodedLength); + + const char* magic_ptr = + input->data() + input->size() - kMagicNumberLengthByte; + const uint32_t magic_lo = DecodeFixed32(magic_ptr); + const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); + uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | + (static_cast<uint64_t>(magic_lo))); + + // We check for legacy formats here and silently upconvert them + bool legacy = IsLegacyFooterFormat(magic); + if (legacy) { + magic = UpconvertLegacyFooterFormat(magic); + } + set_table_magic_number(magic); + + if (legacy) { + // The size is already asserted to be at least kMinEncodedLength + // at the beginning of the function + input->remove_prefix(input->size() - 
kVersion0EncodedLength); + version_ = 0 /* legacy */; + checksum_ = kCRC32c; + } else { + version_ = DecodeFixed32(magic_ptr - 4); + // Footer version 1 and higher will always occupy exactly this many bytes. + // It consists of the checksum type, two block handles, padding, + // a version number, and a magic number + if (input->size() < kNewVersionsEncodedLength) { + return Status::Corruption("input is too short to be an sstable"); + } else { + input->remove_prefix(input->size() - kNewVersionsEncodedLength); + } + uint32_t chksum; + if (!GetVarint32(input, &chksum)) { + return Status::Corruption("bad checksum type"); + } + checksum_ = static_cast<ChecksumType>(chksum); + } + + Status result = metaindex_handle_.DecodeFrom(input); + if (result.ok()) { + result = index_handle_.DecodeFrom(input); + } + if (result.ok()) { + // We skip over any leftover data (just padding for now) in "input" + const char* end = magic_ptr + kMagicNumberLengthByte; + *input = Slice(end, input->data() + input->size() - end); + } + return result; +} + +std::string Footer::ToString() const { + std::string result; + result.reserve(1024); + + bool legacy = IsLegacyFooterFormat(table_magic_number_); + if (legacy) { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + + rocksdb::ToString(table_magic_number_) + "\n "); + } else { + result.append("checksum: " + rocksdb::ToString(checksum_) + "\n "); + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("footer version: " + rocksdb::ToString(version_) + "\n "); + result.append("table_magic_number: " + + rocksdb::ToString(table_magic_number_) + "\n "); + } + return result; +} + +Status ReadFooterFromFile(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number) { + if (file_size < Footer::kMinEncodedLength) { + return Status::Corruption("file is too short (" + ToString(file_size) + + " bytes) to be an " + "sstable: " + + file->file_name()); + } + + char footer_space[Footer::kMaxEncodedLength]; + Slice footer_input; + size_t read_offset = + (file_size > Footer::kMaxEncodedLength) + ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength) + : 0; + Status s; + if (prefetch_buffer == nullptr || + !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, + &footer_input)) { + s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, + footer_space); + if (!s.ok()) return s; + } + + // Check that we actually read the whole footer from the file. It may be + // that size isn't correct. 
+ if (footer_input.size() < Footer::kMinEncodedLength) { + return Status::Corruption("file is too short (" + ToString(file_size) + + " bytes) to be an " + "sstable" + + file->file_name()); + } + + s = footer->DecodeFrom(&footer_input); + if (!s.ok()) { + return s; + } + if (enforce_table_magic_number != 0 && + enforce_table_magic_number != footer->table_magic_number()) { + return Status::Corruption( + "Bad table magic number: expected " + + ToString(enforce_table_magic_number) + ", found " + + ToString(footer->table_magic_number()) + " in " + file->file_name()); + } + return Status::OK(); +} + +Status UncompressBlockContentsForCompressionType( + const UncompressionInfo& uncompression_info, const char* data, size_t n, + BlockContents* contents, uint32_t format_version, + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { + CacheAllocationPtr ubuf; + + assert(uncompression_info.type() != kNoCompression && + "Invalid compression type"); + + StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( + ioptions.env, ioptions.statistics)); + int decompress_size = 0; + switch (uncompression_info.type()) { + case kSnappyCompression: { + size_t ulength = 0; + static char snappy_corrupt_msg[] = + "Snappy not supported or corrupted Snappy compressed block contents"; + if (!Snappy_GetUncompressedLength(data, n, &ulength)) { + return Status::Corruption(snappy_corrupt_msg); + } + ubuf = AllocateBlock(ulength, allocator); + if (!Snappy_Uncompress(data, n, ubuf.get())) { + return Status::Corruption(snappy_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), ulength); + break; + } + case kZlibCompression: + ubuf = Zlib_Uncompress( + uncompression_info, data, n, &decompress_size, + GetCompressFormatForVersion(kZlibCompression, format_version), + allocator); + if (!ubuf) { + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib compressed block contents"; + return Status::Corruption(zlib_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + case kBZip2Compression: + ubuf = BZip2_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kBZip2Compression, format_version), + allocator); + if (!ubuf) { + static char bzip2_corrupt_msg[] = + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; + return Status::Corruption(bzip2_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + case kLZ4Compression: + ubuf = LZ4_Uncompress( + uncompression_info, data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4Compression, format_version), + allocator); + if (!ubuf) { + static char lz4_corrupt_msg[] = + "LZ4 not supported or corrupted LZ4 compressed block contents"; + return Status::Corruption(lz4_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + case kLZ4HCCompression: + ubuf = LZ4_Uncompress( + uncompression_info, data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + allocator); + if (!ubuf) { + static char lz4hc_corrupt_msg[] = + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; + return Status::Corruption(lz4hc_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + case kXpressCompression: + // XPRESS allocates memory internally, thus no support for custom + // allocator. 
+ ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); + if (!ubuf) { + static char xpress_corrupt_msg[] = + "XPRESS not supported or corrupted XPRESS compressed block " + "contents"; + return Status::Corruption(xpress_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + case kZSTD: + case kZSTDNotFinalCompression: + ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size, + allocator); + if (!ubuf) { + static char zstd_corrupt_msg[] = + "ZSTD not supported or corrupted ZSTD compressed block contents"; + return Status::Corruption(zstd_corrupt_msg); + } + *contents = BlockContents(std::move(ubuf), decompress_size); + break; + default: + return Status::Corruption("bad block type"); + } + + if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { + RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + contents->data.size()); + RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + + return Status::OK(); +} + +// +// The 'data' points to the raw block contents that was read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This +// buffer is returned via 'result' and it is upto the caller to +// free this buffer. +// format_version is the block format as defined in include/rocksdb/table.h +Status UncompressBlockContents(const UncompressionInfo& uncompression_info, + const char* data, size_t n, + BlockContents* contents, uint32_t format_version, + const ImmutableCFOptions& ioptions, + MemoryAllocator* allocator) { + assert(data[n] != kNoCompression); + assert(data[n] == uncompression_info.type()); + return UncompressBlockContentsForCompressionType(uncompression_info, data, n, + contents, format_version, + ioptions, allocator); +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h new file mode 100644 index 00000000..f5858850 --- /dev/null +++ b/src/rocksdb/table/format.h @@ -0,0 +1,306 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> +#include <string> +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" + +#include "options/cf_options.h" +#include "port/port.h" // noexcept +#include "table/persistent_cache_options.h" +#include "util/file_reader_writer.h" +#include "util/memory_allocator.h" + +namespace rocksdb { + +class RandomAccessFile; +struct ReadOptions; + +extern bool ShouldReportDetailedTime(Env* env, Statistics* stats); + +// the length of the magic number in bytes. +const int kMagicNumberLengthByte = 8; + +// BlockHandle is a pointer to the extent of a file that stores a data +// block or a meta block. +class BlockHandle { + public: + BlockHandle(); + BlockHandle(uint64_t offset, uint64_t size); + + // The offset of the block in the file. 
+ uint64_t offset() const { return offset_; } + void set_offset(uint64_t _offset) { offset_ = _offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t _size) { size_ = _size; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); + + // Return a string that contains the copy of handle. + std::string ToString(bool hex = true) const; + + // if the block handle's offset and size are both "0", we will view it + // as a null block handle that points to no where. + bool IsNull() const { return offset_ == 0 && size_ == 0; } + + static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } + + // Maximum encoding length of a BlockHandle + enum { kMaxEncodedLength = 10 + 10 }; + + private: + uint64_t offset_; + uint64_t size_; + + static const BlockHandle kNullBlockHandle; +}; + +inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, + uint32_t version) { +#ifdef NDEBUG + (void)compression_type; +#endif + // snappy is not versioned + assert(compression_type != kSnappyCompression && + compression_type != kXpressCompression && + compression_type != kNoCompression); + // As of version 2, we encode compressed block with + // compress_format_version == 2. Before that, the version is 1. + // DO NOT CHANGE THIS FUNCTION, it affects disk format + return version >= 2 ? 2 : 1; +} + +inline bool BlockBasedTableSupportedVersion(uint32_t version) { + return version <= 4; +} + +// Footer encapsulates the fixed information stored at the tail +// end of every table file. +class Footer { + public: + // Constructs a footer without specifying its table magic number. + // In such case, the table magic number of such footer should be + // initialized via @ReadFooterFromFile(). + // Use this when you plan to load Footer with DecodeFrom(). Never use this + // when you plan to EncodeTo. + Footer() : Footer(kInvalidTableMagicNumber, 0) {} + + // Use this constructor when you plan to write out the footer using + // EncodeTo(). Never use this constructor with DecodeFrom(). + Footer(uint64_t table_magic_number, uint32_t version); + + // The version of the footer in this file + uint32_t version() const { return version_; } + + // The checksum type used in this file + ChecksumType checksum() const { return checksum_; } + void set_checksum(const ChecksumType c) { checksum_ = c; } + + // The block handle for the metaindex block of the table + const BlockHandle& metaindex_handle() const { return metaindex_handle_; } + void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } + + // The block handle for the index block of the table + const BlockHandle& index_handle() const { return index_handle_; } + + void set_index_handle(const BlockHandle& h) { index_handle_ = h; } + + uint64_t table_magic_number() const { return table_magic_number_; } + + void EncodeTo(std::string* dst) const; + + // Set the current footer based on the input slice. + // + // REQUIRES: table_magic_number_ is not set (i.e., + // HasInitializedTableMagicNumber() is true). The function will initialize the + // magic number + Status DecodeFrom(Slice* input); + + // Encoded length of a Footer. Note that the serialization of a Footer will + // always occupy at least kMinEncodedLength bytes. If fields are changed + // the version number should be incremented and kMaxEncodedLength should be + // increased accordingly. 
+ enum { + // Footer version 0 (legacy) will always occupy exactly this many bytes. + // It consists of two block handles, padding, and a magic number. + kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. It consists of the checksum type, two block handles, padding, + // a version number (bigger than 1), and a magic number + kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, + kMinEncodedLength = kVersion0EncodedLength, + kMaxEncodedLength = kNewVersionsEncodedLength, + }; + + static const uint64_t kInvalidTableMagicNumber = 0; + + // convert this object to a human readable form + std::string ToString() const; + + private: + // REQUIRES: magic number wasn't initialized. + void set_table_magic_number(uint64_t magic_number) { + assert(!HasInitializedTableMagicNumber()); + table_magic_number_ = magic_number; + } + + // return true if @table_magic_number_ is set to a value different + // from @kInvalidTableMagicNumber. + bool HasInitializedTableMagicNumber() const { + return (table_magic_number_ != kInvalidTableMagicNumber); + } + + uint32_t version_; + ChecksumType checksum_; + BlockHandle metaindex_handle_; + BlockHandle index_handle_; + uint64_t table_magic_number_ = 0; +}; + +// Read the footer from file +// If enforce_table_magic_number != 0, ReadFooterFromFile() will return +// corruption if table_magic number is not equal to enforce_table_magic_number +Status ReadFooterFromFile(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number = 0); + +// 1-byte type + 32-bit crc +static const size_t kBlockTrailerSize = 5; + +inline CompressionType get_block_compression_type(const char* block_data, + size_t block_size) { + return static_cast<CompressionType>(block_data[block_size]); +} + +struct BlockContents { + Slice data; // Actual contents of data + CacheAllocationPtr allocation; + +#ifndef NDEBUG + // Whether the block is a raw block, which contains compression type + // byte. It is only used for assertion. + bool is_raw_block = false; +#endif // NDEBUG + + BlockContents() {} + + BlockContents(const Slice& _data) : data(_data) {} + + BlockContents(CacheAllocationPtr&& _data, size_t _size) + : data(_data.get(), _size), allocation(std::move(_data)) {} + + BlockContents(std::unique_ptr<char[]>&& _data, size_t _size) + : data(_data.get(), _size) { + allocation.reset(_data.release()); + } + + bool own_bytes() const { return allocation.get() != nullptr; } + + // It's the caller's responsibility to make sure that this is + // for raw block contents, which contains the compression + // byte in the end. + CompressionType get_compression_type() const { + assert(is_raw_block); + return get_block_compression_type(data.data(), data.size()); + } + + // The additional memory space taken by the block data. 
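+  // (Returns 0 when the contents do not own their bytes, i.e. there is no
+  // separate heap allocation backing `data`.)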
+ size_t usable_size() const { + if (allocation.get() != nullptr) { + auto allocator = allocation.get_deleter().allocator; + if (allocator) { + return allocator->UsableSize(allocation.get(), data.size()); + } +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size(allocation.get()); +#else + return data.size(); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + } else { + return 0; // no extra memory is occupied by the data + } + } + + size_t ApproximateMemoryUsage() const { + return usable_size() + sizeof(*this); + } + + BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT { + *this = std::move(other); + } + + BlockContents& operator=(BlockContents&& other) { + data = std::move(other.data); + allocation = std::move(other.allocation); +#ifndef NDEBUG + is_raw_block = other.is_raw_block; +#endif // NDEBUG + return *this; + } +}; + +// Read the block identified by "handle" from "file". On failure +// return non-OK. On success fill *result and return OK. +extern Status ReadBlockContents( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + BlockContents* contents, const ImmutableCFOptions& ioptions, + bool do_uncompress = true, const Slice& compression_dict = Slice(), + const PersistentCacheOptions& cache_options = PersistentCacheOptions()); + +// The 'data' points to the raw block contents read in from file. +// This method allocates a new heap buffer and the raw block +// contents are uncompresed into this buffer. This buffer is +// returned via 'result' and it is upto the caller to +// free this buffer. +// For description of compress_format_version and possible values, see +// util/compression.h +extern Status UncompressBlockContents(const UncompressionInfo& info, + const char* data, size_t n, + BlockContents* contents, + uint32_t compress_format_version, + const ImmutableCFOptions& ioptions, + MemoryAllocator* allocator = nullptr); + +// This is an extension to UncompressBlockContents that accepts +// a specific compression type. This is used by un-wrapped blocks +// with no compression header. +extern Status UncompressBlockContentsForCompressionType( + const UncompressionInfo& info, const char* data, size_t n, + BlockContents* contents, uint32_t compress_format_version, + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); + +// Implementation details follow. Clients should ignore, + +// TODO(andrewkr): we should prefer one way of representing a null/uninitialized +// BlockHandle. Currently we use zeros for null and use negation-of-zeros for +// uninitialized. +inline BlockHandle::BlockHandle() + : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {} + +inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) + : offset_(_offset), size_(_size) {} + +} // namespace rocksdb diff --git a/src/rocksdb/table/full_filter_bits_builder.h b/src/rocksdb/table/full_filter_bits_builder.h new file mode 100644 index 00000000..851ed1e2 --- /dev/null +++ b/src/rocksdb/table/full_filter_bits_builder.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +class Slice; + +class FullFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes); + + ~FullFilterBitsBuilder(); + + virtual void AddKey(const Slice& key) override; + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + // | ... | + // | | + // +----------------------------------------------------------------+ + // | ... | num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + virtual Slice Finish(std::unique_ptr<const char[]>* buf) override; + + // Calculate num of entries fit into a space. + virtual int CalculateNumEntry(const uint32_t space) override; + + // Calculate space for new filter. This is reverse of CalculateNumEntry. + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + private: + friend class FullFilterBlockTest_DuplicateEntries_Test; + size_t bits_per_key_; + size_t num_probes_; + std::vector<uint32_t> hash_entries_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&); + void operator=(const FullFilterBitsBuilder&); +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/full_filter_block.cc b/src/rocksdb/table/full_filter_block.cc new file mode 100644 index 00000000..a7491a71 --- /dev/null +++ b/src/rocksdb/table/full_filter_block.cc @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
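Editorial note: the trailer layout documented above for FullFilterBitsBuilder::Finish() (filter bits, then a 1-byte num_probes, then a 4-byte num_lines) can be illustrated with a small standalone decoder. This sketch is not part of the upstream files; the helper name is invented, and it open-codes the little-endian fixed-32 read that RocksDB performs with DecodeFixed32() from util/coding.h.

#include <cassert>
#include <cstdint>
#include <cstring>

// Recover num_probes and num_lines from a finished full-filter slice laid out
// as: [filter bits: (len - 5) bytes][num_probes: 1 byte][num_lines: 4 bytes].
inline void DecodeFullFilterTrailer(const char* data, size_t len,
                                    uint32_t* num_probes, uint32_t* num_lines) {
  assert(len >= 5);  // Finish() always appends the 5-byte trailer
  *num_probes = static_cast<uint8_t>(data[len - 5]);
  std::memcpy(num_lines, data + len - 4, sizeof(*num_lines));  // little-endian assumed
  // total_bits == (len - 5) * 8, so when num_lines != 0 the cache line size
  // chosen at build time is (len - 5) / num_lines.
}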
+ +#include "table/full_filter_block.h" + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif + +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "util/coding.h" + +namespace rocksdb { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(whole_key_filtering), + last_whole_key_recorded_(false), + last_prefix_recorded_(false), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); + if (whole_key_filtering_) { + if (!add_prefix) { + AddKey(key); + } else { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the + // last item. + Slice last_whole_key = Slice(last_whole_key_str_); + if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { + AddKey(key); + last_whole_key_recorded_ = true; + last_whole_key_str_.assign(key.data(), key.size()); + } + } + } + if (add_prefix) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + if (whole_key_filtering_) { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the last + // item. 
+ Slice last_prefix = Slice(last_prefix_str_); + if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { + AddKey(prefix); + last_prefix_recorded_ = true; + last_prefix_str_.assign(prefix.data(), prefix.size()); + } + } else { + AddKey(prefix); + } +} + +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + +Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + Reset(); + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data_); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, bool _whole_key_filtering, + const Slice& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats) + : FilterBlockReader(contents.size(), stats, _whole_key_filtering), + prefix_extractor_(prefix_extractor), + contents_(contents) { + assert(filter_bits_reader != nullptr); + filter_bits_reader_.reset(filter_bits_reader); + if (prefix_extractor_ != nullptr) { + full_length_enabled_ = + prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + } +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, bool _whole_key_filtering, + BlockContents&& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats) + : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, + contents.data, filter_bits_reader, stats) { + block_contents_ = std::move(contents); +} + +bool FullFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /*prefix_extractor*/, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key); +} + +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + return MayMatch(prefix); +} + +bool FullFilterBlockReader::MayMatch(const Slice& entry) { + if (contents_.size() != 0) { + if (filter_bits_reader_->MayMatch(entry)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } + return true; // remain the same with block_based filter +} + +size_t FullFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = block_contents_.usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(filter_bits_reader_.get()); +#else + usage += sizeof(*this); + usage += sizeof(*filter_bits_reader_.get()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; +} + +bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, + const Slice& user_key, const SliceTransform* prefix_extractor, + const Comparator* comparator, const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + 
return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } +} + +bool FullFilterBlockReader::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) { + // Try to reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + if (iterate_upper_bound != nullptr && prefix_extractor_) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = + prefix_extractor_->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (!comparator->Equal(prefix, upper_bound_xform)) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/full_filter_block.h b/src/rocksdb/table/full_filter_block.h new file mode 100644 index 00000000..e4384c91 --- /dev/null +++ b/src/rocksdb/table/full_filter_block.h @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <memory> +#include <string> +#include <vector> +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "db/dbformat.h" +#include "util/hash.h" +#include "table/filter_block.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder); + // bits_builder is created in filter_policy, it should be passed in here + // directly. 
and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t /*block_offset*/) override {} + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + protected: + virtual void AddKey(const Slice& key); + std::unique_ptr<FilterBitsBuilder> filter_bits_builder_; + virtual void Reset(); + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + bool last_whole_key_recorded_; + std::string last_whole_key_str_; + bool last_prefix_recorded_; + std::string last_prefix_str_; + + uint32_t num_added_; + std::unique_ptr<const char[]> filter_data_; + + void AddPrefix(const Slice& key); + + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&); + void operator=(const FullFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and filter_bits_reader must stay live + // while *this is live. + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + const Slice& contents, + FilterBitsReader* filter_bits_reader, + Statistics* statistics); + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + BlockContents&& contents, + FilterBitsReader* filter_bits_reader, + Statistics* statistics); + + // bits_reader is created in filter_policy, it should be passed in here + // directly. 
and be deleted here + ~FullFilterBlockReader() {} + + virtual bool IsBlockBased() override { return false; } + + virtual bool KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + + virtual bool PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + virtual size_t ApproximateMemoryUsage() const override; + virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check) override; + private: + const SliceTransform* prefix_extractor_; + Slice contents_; + std::unique_ptr<FilterBitsReader> filter_bits_reader_; + BlockContents block_contents_; + bool full_length_enabled_; + size_t prefix_extractor_full_length_; + + // No copying allowed + FullFilterBlockReader(const FullFilterBlockReader&); + bool MayMatch(const Slice& entry); + void operator=(const FullFilterBlockReader&); + bool IsFilterCompatible(const Slice* iterate_upper_bound, + const Slice& prefix, const Comparator* comparator); + +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/full_filter_block_test.cc b/src/rocksdb/table/full_filter_block_test.cc new file mode 100644 index 00000000..f01ae52b --- /dev/null +++ b/src/rocksdb/table/full_filter_block_test.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + Slice Finish(std::unique_ptr<const char[]>* buf) override { + uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + private: + std::vector<uint32_t> hash_entries_; +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {} + + bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } + + FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest : public testing::Test { + public: + BlockBasedTableOptions table_options_; + + PluginFullFilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder( + nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader( + nullptr, true, block, + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); +} + +TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder( + nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader( + nullptr, true, block, + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); + 
ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); +} + +class FullFilterBlockTest : public testing::Test { + public: + BlockBasedTableOptions table_options_; + + FullFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); + } + + ~FullFilterBlockTest() override {} +}; + +TEST_F(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder( + nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader( + nullptr, true, block, + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); +} + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = dynamic_cast<FullFilterBitsBuilder*>( + table_options_.filter_policy->GetFilterBitsBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("key"); // test with empty prefix + ASSERT_EQ(2, bits_builder->hash_entries_.size()); + } + + // mix of empty and non-empty + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = dynamic_cast<FullFilterBitsBuilder*>( + table_options_.filter_policy->GetFilterBitsBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // two prefix adn 4 keys + ASSERT_EQ(1 + 2 + 4, bits_builder->hash_entries_.size()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder( + nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice block = builder.Finish(); + FullFilterBlockReader reader( + nullptr, true, block, + table_options_.filter_policy->GetFilterBitsReader(block), nullptr); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc new file mode 100644 index 00000000..24c9ba7d --- /dev/null +++ b/src/rocksdb/table/get_context.cc @@ -0,0 +1,338 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/get_context.h" +#include "db/merge_helper.h" +#include "db/pinned_iterators_manager.h" +#include "db/read_callback.h" +#include "monitoring/file_read_sample.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" + +namespace rocksdb { + +namespace { + +void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { +#ifndef ROCKSDB_LITE + if (replay_log) { + if (replay_log->empty()) { + // Optimization: in the common case of only one operation in the + // log, we allocate the exact amount of space needed. + replay_log->reserve(1 + VarintLength(value.size()) + value.size()); + } + replay_log->push_back(type); + PutLengthPrefixedSlice(replay_log, value); + } +#else + (void)replay_log; + (void)type; + (void)value; +#endif // ROCKSDB_LITE +} + +} // namespace + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + bool* value_found, MergeContext* merge_context, + SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + pinnable_val_(pinnable_val), + value_found_(value_found), + merge_context_(merge_context), + max_covering_tombstone_seq_(_max_covering_tombstone_seq), + env_(env), + seq_(seq), + replay_log_(nullptr), + pinned_iters_mgr_(_pinned_iters_mgr), + callback_(callback), + is_blob_index_(is_blob_index) { + if (seq_) { + *seq_ = kMaxSequenceNumber; + } + sample_ = should_sample_file_read(); +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. 
In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +void GetContext::MarkKeyMayExist() { + state_ = kFound; + if (value_found_ != nullptr) { + *value_found_ = false; + } +} + +void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { + assert(state_ == kNotFound); + appendToReplayLog(replay_log_, kTypeValue, value); + + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + pinnable_val_->PinSelf(value); + } +} + +void GetContext::ReportCounters() { + if (get_context_stats_.num_cache_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); + } + if (get_context_stats_.num_cache_index_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT, + get_context_stats_.num_cache_index_hit); + } + if (get_context_stats_.num_cache_data_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_HIT, + get_context_stats_.num_cache_data_hit); + } + if (get_context_stats_.num_cache_filter_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT, + get_context_stats_.num_cache_filter_hit); + } + if (get_context_stats_.num_cache_compression_dict_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT, + get_context_stats_.num_cache_compression_dict_hit); + } + if (get_context_stats_.num_cache_index_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS, + get_context_stats_.num_cache_index_miss); + } + if (get_context_stats_.num_cache_filter_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS, + get_context_stats_.num_cache_filter_miss); + } + if (get_context_stats_.num_cache_data_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_MISS, + get_context_stats_.num_cache_data_miss); + } + if (get_context_stats_.num_cache_compression_dict_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS, + get_context_stats_.num_cache_compression_dict_miss); + } + if (get_context_stats_.num_cache_bytes_read > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_READ, + get_context_stats_.num_cache_bytes_read); + } + if (get_context_stats_.num_cache_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_MISS, + get_context_stats_.num_cache_miss); + } + if (get_context_stats_.num_cache_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); + } + if (get_context_stats_.num_cache_bytes_write > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, + get_context_stats_.num_cache_bytes_write); + } + if (get_context_stats_.num_cache_index_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, + get_context_stats_.num_cache_index_add); + } + if (get_context_stats_.num_cache_index_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, + get_context_stats_.num_cache_index_bytes_insert); + } + if (get_context_stats_.num_cache_data_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, + get_context_stats_.num_cache_data_add); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); + } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + 
get_context_stats_.num_cache_filter_bytes_insert); + } + if (get_context_stats_.num_cache_compression_dict_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, + get_context_stats_.num_cache_compression_dict_add); + } + if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + get_context_stats_.num_cache_compression_dict_bytes_insert); + } +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value, bool* matched, + Cleanable* value_pinner) { + assert(matched); + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->Equal(parsed_key.user_key, user_key_)) { + *matched = true; + // If the value is not in the snapshot, skip it + if (!CheckCallback(parsed_key.sequence)) { + return true; // to continue to the next seq + } + + appendToReplayLog(replay_log_, parsed_key.type, value); + + if (seq_ != nullptr) { + // Set the sequence number if it is uninitialized + if (*seq_ == kMaxSequenceNumber) { + *seq_ = parsed_key.sequence; + } + } + + auto type = parsed_key.type; + // Key matches. Process it + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && + max_covering_tombstone_seq_ != nullptr && + *max_covering_tombstone_seq_ > parsed_key.sequence) { + type = kTypeRangeDeletion; + } + switch (type) { + case kTypeValue: + case kTypeBlobIndex: + assert(state_ == kNotFound || state_ == kMerge); + if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { + // Blob value not supported. Stop. + state_ = kBlobIndex; + return false; + } + if (kNotFound == state_) { + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + + // Otherwise copy the value + pinnable_val_->PinSelf(value); + } + } + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } + return false; + + case kTypeDeletion: + case kTypeSingleDeletion: + case kTypeRangeDeletion: + // TODO(noetzli): Verify correctness once merge of single-deletes + // is supported + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + // value_pinner is not set from plain_table_reader.cc for example. 
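+        // The operand can only be pinned when a pinned-iterators manager is
+        // active and a pinner was supplied; otherwise PushOperand copies it.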
+ if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } + if (merge_operator_ != nullptr && + merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + return false; + } + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +void replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, Cleanable* value_pinner) { +#ifndef ROCKSDB_LITE + Slice s = replay_log; + while (s.size()) { + auto type = static_cast<ValueType>(*s.data()); + s.remove_prefix(1); + Slice value; + bool ret = GetLengthPrefixedSlice(&s, &value); + assert(ret); + (void)ret; + + bool dont_care __attribute__((__unused__)); + // Since SequenceNumber is not stored and unknown, we will use + // kMaxSequenceNumber. + get_context->SaveValue( + ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, + &dont_care, value_pinner); + } +#else // ROCKSDB_LITE + (void)replay_log; + (void)user_key; + (void)get_context; + (void)value_pinner; + assert(false); +#endif // ROCKSDB_LITE +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h new file mode 100644 index 00000000..d7d0e980 --- /dev/null +++ b/src/rocksdb/table/get_context.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
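Editorial note: appendToReplayLog() and replayGetContextLog() above use a very simple record format: each record is one ValueType byte followed by a varint32-length-prefixed value. The sketch below is not part of the upstream files; the function name is invented, and the inline varint32 decode stands in for GetLengthPrefixedSlice() from util/coding.h.

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Split a GetContext replay log into (type byte, value) records.
// Returns false if the log is truncated or a length varint is malformed.
inline bool ParseReplayLog(const std::string& log,
                           std::vector<std::pair<char, std::string>>* records) {
  size_t pos = 0;
  while (pos < log.size()) {
    char type = log[pos++];  // the ValueType byte pushed by appendToReplayLog()
    uint32_t len = 0;        // varint32 written by PutLengthPrefixedSlice()
    for (int shift = 0;; shift += 7) {
      if (pos >= log.size() || shift > 28) {
        return false;
      }
      uint8_t byte = static_cast<uint8_t>(log[pos++]);
      len |= static_cast<uint32_t>(byte & 0x7f) << shift;
      if ((byte & 0x80) == 0) {
        break;
      }
    }
    if (len > log.size() - pos) {
      return false;
    }
    records->emplace_back(type, log.substr(pos, len));
    pos += len;
  }
  return true;
}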
+ +#pragma once +#include <string> +#include "db/merge_context.h" +#include "db/read_callback.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/types.h" +#include "table/block.h" + +namespace rocksdb { +class MergeContext; +class PinnedIteratorsManager; + +struct GetContextStats { + uint64_t num_cache_hit = 0; + uint64_t num_cache_index_hit = 0; + uint64_t num_cache_data_hit = 0; + uint64_t num_cache_filter_hit = 0; + uint64_t num_cache_compression_dict_hit = 0; + uint64_t num_cache_index_miss = 0; + uint64_t num_cache_filter_miss = 0; + uint64_t num_cache_data_miss = 0; + uint64_t num_cache_compression_dict_miss = 0; + uint64_t num_cache_bytes_read = 0; + uint64_t num_cache_miss = 0; + uint64_t num_cache_add = 0; + uint64_t num_cache_bytes_write = 0; + uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_bytes_insert = 0; + uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_bytes_insert = 0; + uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_bytes_insert = 0; + uint64_t num_cache_compression_dict_add = 0; + uint64_t num_cache_compression_dict_bytes_insert = 0; +}; + +class GetContext { + public: + enum GetState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge, // saver contains the current merge result (the operands) + kBlobIndex, + }; + GetContextStats get_context_stats_; + + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* value, bool* value_found, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* seq = nullptr, + PinnedIteratorsManager* _pinned_iters_mgr = nullptr, + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); + + void MarkKeyMayExist(); + + // Records this key, value, and any meta-data (such as sequence number and + // state) into this GetContext. + // + // If the parsed_key matches the user key that we are looking for, sets + // mathced to true. + // + // Returns True if more keys need to be read (due to merges) or + // False if the complete value has been found. + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, + bool* matched, Cleanable* value_pinner = nullptr); + + // Simplified version of the previous function. Should only be used when we + // know that the operation is a Put. + void SaveValue(const Slice& value, SequenceNumber seq); + + GetState State() const { return state_; } + + SequenceNumber* max_covering_tombstone_seq() { + return max_covering_tombstone_seq_; + } + + PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; } + + // If a non-null string is passed, all the SaveValue calls will be + // logged into the string. The operations can then be replayed on + // another GetContext with replayGetContextLog. + void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; } + + // Do we need to fetch the SequenceNumber for this key? 
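+  // True only when the caller passed a non-null `seq` pointer to the
+  // constructor above.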
+ bool NeedToReadSequence() const { return (seq_ != nullptr); } + + bool sample() const { return sample_; } + + bool CheckCallback(SequenceNumber seq) { + if (callback_) { + return callback_->IsVisible(seq); + } + return true; + } + + void ReportCounters(); + + private: + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + PinnableSlice* pinnable_val_; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; + SequenceNumber* max_covering_tombstone_seq_; + Env* env_; + // If a key is found, seq_ will be set to the SequenceNumber of most recent + // write to the key or kMaxSequenceNumber if unknown + SequenceNumber* seq_; + std::string* replay_log_; + // Used to temporarily pin blocks when state_ == GetContext::kMerge + PinnedIteratorsManager* pinned_iters_mgr_; + ReadCallback* callback_; + bool sample_; + bool* is_blob_index_; +}; + +void replayGetContextLog(const Slice& replay_log, const Slice& user_key, + GetContext* get_context, + Cleanable* value_pinner = nullptr); + +} // namespace rocksdb diff --git a/src/rocksdb/table/index_builder.cc b/src/rocksdb/table/index_builder.cc new file mode 100644 index 00000000..cd28c42a --- /dev/null +++ b/src/rocksdb/table/index_builder.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/index_builder.h" +#include <assert.h> +#include <inttypes.h> + +#include <list> +#include <string> + +#include "rocksdb/comparator.h" +#include "rocksdb/flush_block_policy.h" +#include "table/format.h" +#include "table/partitioned_filter_block.h" + +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace rocksdb { +// using namespace rocksdb; +// Create a index builder based on its type. 
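+// The switch below falls back to an assertion failure (debug builds) and a
+// nullptr result (release builds) for an unrecognized index_type.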
+IndexBuilder* IndexBuilder::CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt) { + IndexBuilder* result = nullptr; + switch (index_type) { + case BlockBasedTableOptions::kBinarySearch: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); + } + break; + case BlockBasedTableOptions::kHashSearch: { + result = new HashIndexBuilder(comparator, int_key_slice_transform, + table_opt.index_block_restart_interval, + table_opt.format_version, + use_value_delta_encoding); + } + break; + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); + } + break; + default: { + assert(!"Do not recognize the index type "); + } + break; + } + return result; +} + +PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( + const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt) { + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); +} + +PartitionedIndexBuilder::PartitionedIndexBuilder( + const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) + : IndexBuilder(comparator), + index_block_builder_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + sub_index_builder_(nullptr), + table_opt_(table_opt), + // We start by false. After each partition we revise the value based on + // what the sub_index_builder has decided. If the feature is disabled + // entirely, this will be set to true after switching the first + // sub_index_builder. Otherwise, it could be set to true even one of the + // sub_index_builders could not safely exclude seq from the keys, then it + // wil be enforced on all sub_index_builders on ::Finish. + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} + +PartitionedIndexBuilder::~PartitionedIndexBuilder() { + delete sub_index_builder_; +} + +void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { + assert(sub_index_builder_ == nullptr); + sub_index_builder_ = new ShortenedIndexBuilder( + comparator_, table_opt_.index_block_restart_interval, + table_opt_.format_version, use_value_delta_encoding_); + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + // Note: this is sub-optimal since sub_index_builder_ could later reset + // seperator_is_key_plus_seq_ but the probability of that is low. + sub_index_builder_->seperator_is_key_plus_seq_ + ? 
sub_index_builder_->index_block_builder_ + : sub_index_builder_->index_block_builder_without_seq_)); + partition_cut_requested_ = false; +} + +void PartitionedIndexBuilder::RequestPartitionCut() { + partition_cut_requested_ = true; +} + +void PartitionedIndexBuilder::AddIndexEntry( + std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, const BlockHandle& block_handle) { + // Note: to avoid two consecuitive flush in the same method call, we do not + // check flush policy when adding the last key + if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys + if (sub_index_builder_ == nullptr) { + MakeNewSubIndexBuilder(); + } + sub_index_builder_->AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } + sub_index_last_key_ = std::string(*last_key_in_current_block); + entries_.push_back( + {sub_index_last_key_, + std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)}); + sub_index_builder_ = nullptr; + cut_filter_block = true; + } else { + // apply flush policy only to non-empty sub_index_builder_ + if (sub_index_builder_ != nullptr) { + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + bool do_flush = + partition_cut_requested_ || + flush_policy_->Update(*last_key_in_current_block, handle_encoding); + if (do_flush) { + entries_.push_back( + {sub_index_last_key_, + std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)}); + cut_filter_block = true; + sub_index_builder_ = nullptr; + } + } + if (sub_index_builder_ == nullptr) { + MakeNewSubIndexBuilder(); + } + sub_index_builder_->AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + sub_index_last_key_ = std::string(*last_key_in_current_block); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } + } +} + +Status PartitionedIndexBuilder::Finish( + IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } + // It must be set to null after last key is added + assert(sub_index_builder_ == nullptr); + if (finishing_indexes == true) { + Entry& last_entry = entries_.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), + handle_encoding, + &handle_delta_encoding_slice); + } + entries_.pop_front(); + } + // If there is no sub_index left, then return the 2nd level index. 
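+  // Returning OK (instead of Incomplete) below tells the caller that every
+  // partition has been written out and index_block_contents now holds the
+  // top-level (2nd level) index.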
+ if (UNLIKELY(entries_.empty())) { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; + return Status::OK(); + } else { + // Finish the next partition index in line and Incomplete() to indicate we + // expect more calls to Finish + Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; + auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); + finishing_indexes = true; + return s.ok() ? Status::Incomplete() : s; + } +} + +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } +} // namespace rocksdb diff --git a/src/rocksdb/table/index_builder.h b/src/rocksdb/table/index_builder.h new file mode 100644 index 00000000..87d7b7a7 --- /dev/null +++ b/src/rocksdb/table/index_builder.h @@ -0,0 +1,411 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <assert.h> +#include <inttypes.h> + +#include <list> +#include <string> +#include <unordered_map> + +#include "rocksdb/comparator.h" +#include "table/block_based_table_factory.h" +#include "table/block_builder.h" +#include "table/format.h" + +namespace rocksdb { +// The interface for building index. +// Instruction for adding a new concrete IndexBuilder: +// 1. Create a subclass instantiated from IndexBuilder. +// 2. Add a new entry associated with that subclass in TableOptions::IndexType. +// 3. Add a create function for the new subclass in CreateIndexBuilder. +// Note: we can devise more advanced design to simplify the process for adding +// new subclass, which will, on the other hand, increase the code complexity and +// catch unwanted attention from readers. Given that we won't add/change +// indexes frequently, it makes sense to just embrace a more straightforward +// design that just works. +class IndexBuilder { + public: + static IndexBuilder* CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const rocksdb::InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + // Index builder will construct a set of blocks which contain: + // 1. One primary index block. + // 2. (Optional) a set of metablocks that contains the metadata of the + // primary index. + struct IndexBlocks { + Slice index_block_contents; + std::unordered_map<std::string, Slice> meta_blocks; + }; + explicit IndexBuilder(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. 
+ // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // @last_key_in_current_block: this parameter maybe overridden with the value + // "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + virtual void OnKeyAdded(const Slice& /*key*/) {} + + // Inform the index builder that all entries has been written. Block builder + // may therefore perform any operation required for block finalization. + // + // REQUIRES: Finish() has not yet been called. + inline Status Finish(IndexBlocks* index_blocks) { + // Throw away the changes to last_partition_block_handle. It has no effect + // on the first call to Finish anyway. + BlockHandle last_partition_block_handle; + return Finish(index_blocks, last_partition_block_handle); + } + + // This override of Finish can be utilized to build the 2nd level index in + // PartitionIndexBuilder. + // + // index_blocks will be filled with the resulting index data. If the return + // value is Status::InComplete() then it means that the index is partitioned + // and the callee should keep calling Finish until Status::OK() is returned. + // In that case, last_partition_block_handle is pointer to the block written + // with the result of the last call to Finish. This can be utilized to build + // the second level index pointing to each block of partitioned indexes. The + // last call to Finish() that returns Status::OK() populates index_blocks with + // the 2nd level index content. + virtual Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) = 0; + + // Get the size for index block. Must be called after ::Finish. + virtual size_t IndexSize() const = 0; + + virtual bool seperator_is_key_plus_seq() { return true; } + + protected: + const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; +}; + +// This index builder builds space-efficient index block. +// +// Optimizations: +// 1. Made block's `block_restart_interval` to be 1, which will avoid linear +// search when doing index lookup (can be disabled by setting +// index_block_restart_interval). +// 2. Shorten the key length for index block. Other than honestly using the +// last key in the data block as the index key, we instead find a shortest +// substitute key that serves the same function. 
+class ShortenedIndexBuilder : public IndexBuilder { + public: + explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) + : IndexBuilder(comparator), + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + if (first_key_in_next_block != nullptr) { + comparator_->FindShortestSeparator(last_key_in_current_block, + *first_key_in_next_block); + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } + } else { + comparator_->FindShortSuccessor(last_key_in_current_block); + } + auto sep = Slice(*last_key_in_current_block); + + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); + } + } + + using IndexBuilder::Finish; + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& /*last_partition_block_handle*/) override { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + index_size_ = index_blocks->index_block_contents.size(); + return Status::OK(); + } + + virtual size_t IndexSize() const override { return index_size_; } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + friend class PartitionedIndexBuilder; + + private: + BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; +}; + +// HashIndexBuilder contains a binary-searchable primary index and the +// metadata for secondary hash index construction. +// The metadata for hash index consists two parts: +// - a metablock that compactly contains a sequence of prefixes. All prefixes +// are stored consectively without any metadata (like, prefix sizes) being +// stored, which is kept in the other metablock. +// - a metablock contains the metadata of the prefixes, including prefix size, +// restart index and number of block it spans. 
The format looks like: +// +// +-----------------+---------------------------+---------------------+ +// <=prefix 1 +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// <=prefix 2 +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// | | +// | .... | +// | | +// +-----------------+---------------------------+---------------------+ +// <=prefix n +// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | +// +-----------------+---------------------------+---------------------+ +// +// The reason of separating these two metablocks is to enable the efficiently +// reuse the first metablock during hash index construction without unnecessary +// data copy or small heap allocations for prefixes. +class HashIndexBuilder : public IndexBuilder { + public: + explicit HashIndexBuilder(const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, + int format_version, bool use_value_delta_encoding) + : IndexBuilder(comparator), + primary_index_builder_(comparator, index_block_restart_interval, + format_version, use_value_delta_encoding), + hash_key_extractor_(hash_key_extractor) {} + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + ++current_restart_index_; + primary_index_builder_.AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + } + + virtual void OnKeyAdded(const Slice& key) override { + auto key_prefix = hash_key_extractor_->Transform(key); + bool is_first_entry = pending_block_num_ == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix_ != key_prefix) { + if (!is_first_entry) { + FlushPendingPrefix(); + } + + // need a hard copy otherwise the underlying data changes all the time. + // TODO(kailiu) ToString() is expensive. We may speed up can avoid data + // copy. + pending_entry_prefix_ = key_prefix.ToString(); + pending_block_num_ = 1; + pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); + } else { + // entry number increments when keys share the prefix reside in + // different data blocks. 
+ auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; + assert(last_restart_index <= current_restart_index_); + if (last_restart_index != current_restart_index_) { + ++pending_block_num_; + } + } + } + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { + if (pending_block_num_ != 0) { + FlushPendingPrefix(); + } + primary_index_builder_.Finish(index_blocks, last_partition_block_handle); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesBlock.c_str(), prefix_block_}); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + return Status::OK(); + } + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + + prefix_meta_block_.size(); + } + + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + + private: + void FlushPendingPrefix() { + prefix_block_.append(pending_entry_prefix_.data(), + pending_entry_prefix_.size()); + PutVarint32Varint32Varint32( + &prefix_meta_block_, + static_cast<uint32_t>(pending_entry_prefix_.size()), + pending_entry_index_, pending_block_num_); + } + + ShortenedIndexBuilder primary_index_builder_; + const SliceTransform* hash_key_extractor_; + + // stores a sequence of prefixes + std::string prefix_block_; + // stores the metadata of prefixes + std::string prefix_meta_block_; + + // The following 3 variables keeps unflushed prefix and its metadata. + // The details of block_num and entry_index can be found in + // "block_hash_index.{h,cc}" + uint32_t pending_block_num_ = 0; + uint32_t pending_entry_index_ = 0; + std::string pending_entry_prefix_; + + uint64_t current_restart_index_ = 0; +}; + +/** + * IndexBuilder for two-level indexing. Internally it creates a new index for + * each partition and Finish then in order when Finish is called on it + * continiously until Status::OK() is returned. + * + * The format on the disk would be I I I I I I IP where I is block containing a + * partition of indexes built using ShortenedIndexBuilder and IP is a block + * containing a secondary index on the partitions, built using + * ShortenedIndexBuilder. 
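 *
 * To make the Finish() contract concrete, a caller would drive this builder
 * roughly as in the sketch below (illustrative only: WritePartitionBlock is a
 * hypothetical helper that appends a block to the SST file and returns its
 * BlockHandle; it is not part of this header):
 *
 *   IndexBuilder::IndexBlocks blocks;
 *   BlockHandle last_partition_handle;
 *   Status s = builder->Finish(&blocks, last_partition_handle);
 *   while (s.IsIncomplete()) {
 *     last_partition_handle = WritePartitionBlock(blocks.index_block_contents);
 *     s = builder->Finish(&blocks, last_partition_handle);
 *   }
 *   // Once Status::OK() is returned, `blocks` holds the top-level index that
 *   // points at each partition block written in the loop.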
+ */ +class PartitionedIndexBuilder : public IndexBuilder { + public: + static PartitionedIndexBuilder* CreateIndexBuilder( + const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); + + virtual ~PartitionedIndexBuilder(); + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; + + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } + size_t NumPartitions() const; + + inline bool ShouldCutFilterBlock() { + // Current policy is to align the partitions of index and filters + if (cut_filter_block) { + cut_filter_block = false; + return true; + } + return false; + } + + std::string& GetPartitionKey() { return sub_index_last_key_; } + + // Called when an external entity (such as filter partition builder) request + // cutting the next partition + void RequestPartitionCut(); + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + + void MakeNewSubIndexBuilder(); + + struct Entry { + std::string key; + std::unique_ptr<ShortenedIndexBuilder> value; + }; + std::list<Entry> entries_; // list of partitioned indexes and their keys + BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys + // the active partition index builder + ShortenedIndexBuilder* sub_index_builder_; + // the last key in the active partition index builder + std::string sub_index_last_key_; + std::unique_ptr<FlushBlockPolicy> flush_policy_; + // true if Finish is called once but not complete yet. + bool finishing_indexes = false; + const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; + // true if an external entity (such as filter partition builder) request + // cutting the next partition + bool partition_cut_requested_ = true; + // true if it should cut the next filter partition block + bool cut_filter_block = false; + BlockHandle last_encoded_handle_; +}; +} // namespace rocksdb diff --git a/src/rocksdb/table/internal_iterator.h b/src/rocksdb/table/internal_iterator.h new file mode 100644 index 00000000..a173d606 --- /dev/null +++ b/src/rocksdb/table/internal_iterator.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#pragma once + +#include <string> +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/status.h" +#include "table/format.h" + +namespace rocksdb { + +class PinnedIteratorsManager; + +template <class TValue> +class InternalIteratorBase : public Cleanable { + public: + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} + + // An iterator is either positioned at a key/value pair, or + // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). + virtual bool Valid() const = 0; + + // Position at the first key in the source. The iterator is Valid() + // after this call iff the source is not empty. + virtual void SeekToFirst() = 0; + + // Position at the last key in the source. The iterator is + // Valid() after this call iff the source is not empty. + virtual void SeekToLast() = 0; + + // Position at the first key in the source that at or past target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. + virtual void Seek(const Slice& target) = 0; + + // Position at the first key in the source that at or before target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + virtual void SeekForPrev(const Slice& target) = 0; + + // Moves to the next entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Moves to the previous entry in the source. After this call, Valid() is + // true iff the iterator was not positioned at the first entry in source. + // REQUIRES: Valid() + virtual void Prev() = 0; + + // Return the key for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual Slice key() const = 0; + + // Return the value for the current entry. The underlying storage for + // the returned slice is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual TValue value() const = 0; + + // If an error has occurred, return it. Else return an ok status. + // If non-blocking IO is requested and this operation cannot be + // satisfied without doing some IO, then this returns Status::Incomplete(). + virtual Status status() const = 0; + + // True if the iterator is invalidated because it is out of the iterator + // upper bound + virtual bool IsOutOfBound() { return false; } + + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont + // communicate with PinnedIteratorsManager so default implementation is no-op + // but for Iterators that need to communicate with PinnedIteratorsManager + // they will implement this function and use the passed pointer to communicate + // with PinnedIteratorsManager. + virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) { + } + + // If true, this means that the Slice returned by key() is valid as long as + // PinnedIteratorsManager::ReleasePinnedData is not called and the + // Iterator is not deleted. 
+ // + // IsKeyPinned() is guaranteed to always return true if + // - Iterator is created with ReadOptions::pin_data = true + // - DB tables were created with BlockBasedTableOptions::use_delta_encoding + // set to false. + virtual bool IsKeyPinned() const { return false; } + + // If true, this means that the Slice returned by value() is valid as long as + // PinnedIteratorsManager::ReleasePinnedData is not called and the + // Iterator is not deleted. + virtual bool IsValuePinned() const { return false; } + + virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { + return Status::NotSupported(""); + } + + protected: + void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && cmp->Compare(target, key()) < 0) { + Prev(); + } + } + + private: + // No copying allowed + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; +}; + +using InternalIterator = InternalIteratorBase<Slice>; + +// Return an empty iterator (yields nothing). +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(); + +// Return an empty iterator with the specified status. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status, Arena* arena); + +} // namespace rocksdb diff --git a/src/rocksdb/table/iter_heap.h b/src/rocksdb/table/iter_heap.h new file mode 100644 index 00000000..f30c1227 --- /dev/null +++ b/src/rocksdb/table/iter_heap.h @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include "db/dbformat.h" +#include "table/iterator_wrapper.h" + +namespace rocksdb { + +// When used with std::priority_queue, this comparison functor puts the +// iterator with the max/largest key on top. +class MaxIteratorComparator { + public: + MaxIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { + return comparator_->Compare(a->key(), b->key()) < 0; + } + private: + const InternalKeyComparator* comparator_; +}; + +// When used with std::priority_queue, this comparison functor puts the +// iterator with the min/smallest key on top. +class MinIteratorComparator { + public: + MinIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { + return comparator_->Compare(a->key(), b->key()) > 0; + } + private: + const InternalKeyComparator* comparator_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/iterator.cc b/src/rocksdb/table/iterator.cc new file mode 100644 index 00000000..0475b9d1 --- /dev/null +++ b/src/rocksdb/table/iterator.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+Cleanable::Cleanable() {
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+Cleanable::~Cleanable() { DoCleanup(); }
+
+Cleanable::Cleanable(Cleanable&& other) {
+  *this = std::move(other);
+}
+
+Cleanable& Cleanable::operator=(Cleanable&& other) {
+  if (this != &other) {
+    cleanup_ = other.cleanup_;
+    other.cleanup_.function = nullptr;
+    other.cleanup_.next = nullptr;
+  }
+  return *this;
+}
+
+// If the entire linked list were on the heap we could simply attach one
+// linked list to another. However, the head is an embedded object to avoid
+// the cost of creating objects in the common case where the Cleanable has
+// only one Cleanup to do. We could put everything on the heap if benchmarks
+// show no negative impact on performance.
+// Also, we need to iterate over the linked list since there is no pointer to
+// the tail. We could add a tail pointer, but maintaining it might negatively
+// impact performance for the common case of a single cleanup, where a tail
+// pointer is not needed. Again, benchmarks could clarify that.
+// Even without a tail pointer we could iterate over the list, find the tail,
+// and update only that node instead of inserting the Cleanups one by one.
+// That would, however, be redundant when the source Cleanable has only one or
+// a few Cleanups, which is the case most of the time.
+// TODO(myabandeh): if the list is too long we should maintain a tail pointer
+// and have the entire list (minus the head, which has to be inserted
+// separately) merged with the target linked list at once.
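As a usage illustration of the design described above, here is a minimal sketch (not part of this file) of cleanups being registered on one Cleanable and handed off to another. ReleaseBuffer and DelegationDemo are hypothetical names, and the sketch assumes the public Cleanable interface (RegisterCleanup, DelegateCleanupsTo) declared in rocksdb/iterator.h in this version.

#include <cstdio>

#include "rocksdb/iterator.h"  // declares Cleanable in this RocksDB version

namespace {
// Cleanup callback: arg1 is a heap buffer owned by whoever registered it.
void ReleaseBuffer(void* arg1, void* /*arg2*/) {
  delete[] static_cast<char*>(arg1);
  std::printf("buffer released\n");
}
}  // namespace

void DelegationDemo() {
  rocksdb::Cleanable target;
  {
    rocksdb::Cleanable source;
    source.RegisterCleanup(ReleaseBuffer, new char[128], nullptr);
    // Hand the registered cleanup over to `target`; `source` is left with an
    // empty cleanup list, so destroying it below runs nothing.
    source.DelegateCleanupsTo(&target);
  }
  // ReleaseBuffer has not run yet; it fires when `target` is destroyed at the
  // end of this function.
}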
+void Cleanable::DelegateCleanupsTo(Cleanable* other) { + assert(other != nullptr); + if (cleanup_.function == nullptr) { + return; + } + Cleanup* c = &cleanup_; + other->RegisterCleanup(c->function, c->arg1, c->arg2); + c = c->next; + while (c != nullptr) { + Cleanup* next = c->next; + other->RegisterCleanup(c); + c = next; + } + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +void Cleanable::RegisterCleanup(Cleanable::Cleanup* c) { + assert(c != nullptr); + if (cleanup_.function == nullptr) { + cleanup_.function = c->function; + cleanup_.arg1 = c->arg1; + cleanup_.arg2 = c->arg2; + delete c; + } else { + c->next = cleanup_.next; + cleanup_.next = c; + } +} + +void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +Status Iterator::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.is-key-pinned") { + *prop = "0"; + return Status::OK(); + } + return Status::InvalidArgument("Unidentified property."); +} + +namespace { +class EmptyIterator : public Iterator { + public: + explicit EmptyIterator(const Status& s) : status_(s) { } + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } + Slice key() const override { + assert(false); + return Slice(); + } + Slice value() const override { + assert(false); + return Slice(); + } + Status status() const override { return status_; } + + private: + Status status_; +}; + +template <class TValue = Slice> +class EmptyInternalIterator : public InternalIteratorBase<TValue> { + public: + explicit EmptyInternalIterator(const Status& s) : status_(s) {} + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } + Slice key() const override { + assert(false); + return Slice(); + } + TValue value() const override { + assert(false); + return TValue(); + } + Status status() const override { return status_; } + + private: + Status status_; +}; +} // namespace + +Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); } + +Iterator* NewErrorIterator(const Status& status) { + return new EmptyIterator(status); +} + +template <class TValue> +InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator<TValue>(status); +} +template InternalIteratorBase<BlockHandle>* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase<Slice>* NewErrorInternalIterator( + const Status& status); + +template <class TValue> +InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status, + Arena* arena) { + if (arena == nullptr) { + return NewErrorInternalIterator<TValue>(status); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>)); + return new (mem) 
EmptyInternalIterator<TValue>(status); + } +} +template InternalIteratorBase<BlockHandle>* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase<Slice>* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template <class TValue> +InternalIteratorBase<TValue>* NewEmptyInternalIterator() { + return new EmptyInternalIterator<TValue>(Status::OK()); +} +template InternalIteratorBase<BlockHandle>* NewEmptyInternalIterator(); +template InternalIteratorBase<Slice>* NewEmptyInternalIterator(); + +template <class TValue> +InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) { + if (arena == nullptr) { + return NewEmptyInternalIterator<TValue>(); + } else { + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>)); + return new (mem) EmptyInternalIterator<TValue>(Status::OK()); + } +} +template InternalIteratorBase<BlockHandle>* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena); + +} // namespace rocksdb diff --git a/src/rocksdb/table/iterator_wrapper.h b/src/rocksdb/table/iterator_wrapper.h new file mode 100644 index 00000000..5941b846 --- /dev/null +++ b/src/rocksdb/table/iterator_wrapper.h @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <set> + +#include "table/internal_iterator.h" + +namespace rocksdb { + +// A internal wrapper class with an interface similar to Iterator that caches +// the valid() and key() results for an underlying iterator. +// This can help avoid virtual function calls and also gives better +// cache locality. +template <class TValue = Slice> +class IteratorWrapperBase { + public: + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter) + : iter_(nullptr) { + Set(_iter); + } + ~IteratorWrapperBase() {} + InternalIteratorBase<TValue>* iter() const { return iter_; } + + // Set the underlying Iterator to _iter and return + // previous underlying Iterator. 
+ InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) { + InternalIteratorBase<TValue>* old_iter = iter_; + + iter_ = _iter; + if (iter_ == nullptr) { + valid_ = false; + } else { + Update(); + } + return old_iter; + } + + void DeleteIter(bool is_arena_mode) { + if (iter_) { + if (!is_arena_mode) { + delete iter_; + } else { + iter_->~InternalIteratorBase<TValue>(); + } + } + } + + // Iterator interface methods + bool Valid() const { return valid_; } + Slice key() const { assert(Valid()); return key_; } + TValue value() const { + assert(Valid()); + return iter_->value(); + } + // Methods below require iter() != nullptr + Status status() const { assert(iter_); return iter_->status(); } + void Next() { assert(iter_); iter_->Next(); Update(); } + void Prev() { assert(iter_); iter_->Prev(); Update(); } + void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekForPrev(const Slice& k) { + assert(iter_); + iter_->SeekForPrev(k); + Update(); + } + void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } + void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { + assert(iter_); + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + bool IsKeyPinned() const { + assert(Valid()); + return iter_->IsKeyPinned(); + } + bool IsValuePinned() const { + assert(Valid()); + return iter_->IsValuePinned(); + } + + private: + void Update() { + valid_ = iter_->Valid(); + if (valid_) { + key_ = iter_->key(); + assert(iter_->status().ok()); + } + } + + InternalIteratorBase<TValue>* iter_; + bool valid_; + Slice key_; +}; + +using IteratorWrapper = IteratorWrapperBase<Slice>; + +class Arena; +// Return an empty iterator (yields nothing) allocated from arena. +template <class TValue = Slice> +extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena); + +} // namespace rocksdb diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc new file mode 100644 index 00000000..1b04d065 --- /dev/null +++ b/src/rocksdb/table/merger_test.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include <vector> +#include <string> + +#include "table/merging_iterator.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class MergerTest : public testing::Test { + public: + MergerTest() + : icomp_(BytewiseComparator()), + rnd_(3), + merging_iterator_(nullptr), + single_iterator_(nullptr) {} + ~MergerTest() override = default; + std::vector<std::string> GenerateStrings(size_t len, int string_len) { + std::vector<std::string> ret; + + for (size_t i = 0; i < len; ++i) { + InternalKey ik(test::RandomHumanReadableString(&rnd_, string_len), 0, + ValueType::kTypeValue); + ret.push_back(ik.Encode().ToString(false)); + } + return ret; + } + + void AssertEquivalence() { + auto a = merging_iterator_.get(); + auto b = single_iterator_.get(); + if (!a->Valid()) { + ASSERT_TRUE(!b->Valid()); + } else { + ASSERT_TRUE(b->Valid()); + ASSERT_EQ(b->key().ToString(), a->key().ToString()); + ASSERT_EQ(b->value().ToString(), a->value().ToString()); + } + } + + void SeekToRandom() { + InternalKey ik(test::RandomHumanReadableString(&rnd_, 5), 0, + ValueType::kTypeValue); + Seek(ik.Encode().ToString(false)); + } + + void Seek(std::string target) { + merging_iterator_->Seek(target); + single_iterator_->Seek(target); + } + + void SeekToFirst() { + merging_iterator_->SeekToFirst(); + single_iterator_->SeekToFirst(); + } + + void SeekToLast() { + merging_iterator_->SeekToLast(); + single_iterator_->SeekToLast(); + } + + void Next(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Next(); + single_iterator_->Next(); + } + AssertEquivalence(); + } + + void Prev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Prev(); + single_iterator_->Prev(); + } + AssertEquivalence(); + } + + void NextAndPrev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + if (rnd_.OneIn(2)) { + merging_iterator_->Prev(); + single_iterator_->Prev(); + } else { + merging_iterator_->Next(); + single_iterator_->Next(); + } + } + AssertEquivalence(); + } + + void Generate(size_t num_iterators, size_t strings_per_iterator, + int letters_per_string) { + std::vector<InternalIterator*> small_iterators; + for (size_t i = 0; i < num_iterators; ++i) { + auto strings = GenerateStrings(strings_per_iterator, letters_per_string); + small_iterators.push_back(new test::VectorIterator(strings)); + all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); + } + + merging_iterator_.reset( + NewMergingIterator(&icomp_, &small_iterators[0], + static_cast<int>(small_iterators.size()))); + single_iterator_.reset(new test::VectorIterator(all_keys_)); + } + + InternalKeyComparator icomp_; + Random rnd_; + std::unique_ptr<InternalIterator> merging_iterator_; + std::unique_ptr<InternalIterator> single_iterator_; + std::vector<std::string> all_keys_; +}; + +TEST_F(MergerTest, SeekToRandomNextTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToRandomNextSmallStringsTest) { + Generate(1000, 50, 2); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToRandomPrevTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Prev(50000); + } +} + +TEST_F(MergerTest, SeekToRandomRandomTest) { + Generate(200, 50, 
50); + for (int i = 0; i < 3; ++i) { + SeekToRandom(); + AssertEquivalence(); + NextAndPrev(5000); + } +} + +TEST_F(MergerTest, SeekToFirstTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToFirst(); + AssertEquivalence(); + Next(50000); + } +} + +TEST_F(MergerTest, SeekToLastTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToLast(); + AssertEquivalence(); + Prev(50000); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/merging_iterator.cc b/src/rocksdb/table/merging_iterator.cc new file mode 100644 index 00000000..bd4a186b --- /dev/null +++ b/src/rocksdb/table/merging_iterator.cc @@ -0,0 +1,442 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/merging_iterator.h" +#include <string> +#include <vector> +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" +#include "table/iter_heap.h" +#include "table/iterator_wrapper.h" +#include "util/arena.h" +#include "util/autovector.h" +#include "util/heap.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" + +namespace rocksdb { +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { +typedef BinaryHeap<IteratorWrapper*, MaxIteratorComparator> MergerMaxIterHeap; +typedef BinaryHeap<IteratorWrapper*, MinIteratorComparator> MergerMinIterHeap; +} // namespace + +const size_t kNumIterReserve = 4; + +class MergingIterator : public InternalIterator { + public: + MergingIterator(const InternalKeyComparator* comparator, + InternalIterator** children, int n, bool is_arena_mode, + bool prefix_seek_mode) + : is_arena_mode_(is_arena_mode), + comparator_(comparator), + current_(nullptr), + direction_(kForward), + minHeap_(comparator_), + prefix_seek_mode_(prefix_seek_mode), + pinned_iters_mgr_(nullptr) { + children_.resize(n); + for (int i = 0; i < n; i++) { + children_[i].Set(children[i]); + } + for (auto& child : children_) { + if (child.Valid()) { + assert(child.status().ok()); + minHeap_.push(&child); + } else { + considerStatus(child.status()); + } + } + current_ = CurrentForward(); + } + + void considerStatus(Status s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + + virtual void AddIterator(InternalIterator* iter) { + assert(direction_ == kForward); + children_.emplace_back(iter); + if (pinned_iters_mgr_) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + auto new_wrapper = children_.back(); + if (new_wrapper.Valid()) { + assert(new_wrapper.status().ok()); + minHeap_.push(&new_wrapper); + current_ = CurrentForward(); + } else { + considerStatus(new_wrapper.status()); + } + } + + ~MergingIterator() override { + for (auto& child : children_) { + child.DeleteIter(is_arena_mode_); + } + } + + bool Valid() const override { return current_ != nullptr && status_.ok(); } + + Status status() const override 
{ return status_; } + + void SeekToFirst() override { + ClearHeaps(); + status_ = Status::OK(); + for (auto& child : children_) { + child.SeekToFirst(); + if (child.Valid()) { + assert(child.status().ok()); + minHeap_.push(&child); + } else { + considerStatus(child.status()); + } + } + direction_ = kForward; + current_ = CurrentForward(); + } + + void SeekToLast() override { + ClearHeaps(); + InitMaxHeap(); + status_ = Status::OK(); + for (auto& child : children_) { + child.SeekToLast(); + if (child.Valid()) { + assert(child.status().ok()); + maxHeap_->push(&child); + } else { + considerStatus(child.status()); + } + } + direction_ = kReverse; + current_ = CurrentReverse(); + } + + void Seek(const Slice& target) override { + ClearHeaps(); + status_ = Status::OK(); + for (auto& child : children_) { + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.Seek(target); + } + PERF_COUNTER_ADD(seek_child_seek_count, 1); + + if (child.Valid()) { + assert(child.status().ok()); + PERF_TIMER_GUARD(seek_min_heap_time); + minHeap_.push(&child); + } else { + considerStatus(child.status()); + } + } + direction_ = kForward; + { + PERF_TIMER_GUARD(seek_min_heap_time); + current_ = CurrentForward(); + } + } + + void SeekForPrev(const Slice& target) override { + ClearHeaps(); + InitMaxHeap(); + status_ = Status::OK(); + + for (auto& child : children_) { + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.SeekForPrev(target); + } + PERF_COUNTER_ADD(seek_child_seek_count, 1); + + if (child.Valid()) { + assert(child.status().ok()); + PERF_TIMER_GUARD(seek_max_heap_time); + maxHeap_->push(&child); + } else { + considerStatus(child.status()); + } + } + direction_ = kReverse; + { + PERF_TIMER_GUARD(seek_max_heap_time); + current_ = CurrentReverse(); + } + } + + void Next() override { + assert(Valid()); + + // Ensure that all children are positioned after key(). + // If we are moving in the forward direction, it is already + // true for all of the non-current children since current_ is + // the smallest child and key() == current_->key(). + if (direction_ != kForward) { + SwitchToForward(); + // The loop advanced all non-current children to be > key() so current_ + // should still be strictly the smallest key. + assert(current_ == CurrentForward()); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentForward()); + + // as the current points to the current record. move the iterator forward. + current_->Next(); + if (current_->Valid()) { + // current is still valid after the Next() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + minHeap_.replace_top(current_); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); + minHeap_.pop(); + } + current_ = CurrentForward(); + } + + void Prev() override { + assert(Valid()); + // Ensure that all children are positioned before key(). + // If we are moving in the reverse direction, it is already + // true for all of the non-current children since current_ is + // the largest child and key() == current_->key(). + if (direction_ != kReverse) { + // Otherwise, retreat the non-current children. We retreat current_ + // just after the if-block. 
+ ClearHeaps(); + InitMaxHeap(); + Slice target = key(); + for (auto& child : children_) { + if (&child != current_) { + child.SeekForPrev(target); + TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); + considerStatus(child.status()); + if (child.Valid() && comparator_->Equal(target, child.key())) { + child.Prev(); + considerStatus(child.status()); + } + } + if (child.Valid()) { + assert(child.status().ok()); + maxHeap_->push(&child); + } + } + direction_ = kReverse; + if (!prefix_seek_mode_) { + // Note that we don't do assert(current_ == CurrentReverse()) here + // because it is possible to have some keys larger than the seek-key + // inserted between Seek() and SeekToLast(), which makes current_ not + // equal to CurrentReverse(). + current_ = CurrentReverse(); + } + // The loop advanced all non-current children to be < key() so current_ + // should still be strictly the smallest key. + assert(current_ == CurrentReverse()); + } + + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentReverse()); + + current_->Prev(); + if (current_->Valid()) { + // current is still valid after the Prev() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); + maxHeap_->replace_top(current_); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); + maxHeap_->pop(); + } + current_ = CurrentReverse(); + } + + Slice key() const override { + assert(Valid()); + return current_->key(); + } + + Slice value() const override { + assert(Valid()); + return current_->value(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + for (auto& child : children_) { + child.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + bool IsKeyPinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + current_->IsValuePinned(); + } + + private: + // Clears heaps for both directions, used when changing direction or seeking + void ClearHeaps(); + // Ensures that maxHeap_ is initialized when starting to go in the reverse + // direction + void InitMaxHeap(); + + bool is_arena_mode_; + const InternalKeyComparator* comparator_; + autovector<IteratorWrapper, kNumIterReserve> children_; + + // Cached pointer to child iterator with the current key, or nullptr if no + // child iterators are valid. This is the top of minHeap_ or maxHeap_ + // depending on the direction. + IteratorWrapper* current_; + // If any of the children have non-ok status, this is one of them. + Status status_; + // Which direction is the iterator moving? + enum Direction { + kForward, + kReverse + }; + Direction direction_; + MergerMinIterHeap minHeap_; + bool prefix_seek_mode_; + + // Max heap is used for reverse iteration, which is way less common than + // forward. Lazily initialize it to save memory. + std::unique_ptr<MergerMaxIterHeap> maxHeap_; + PinnedIteratorsManager* pinned_iters_mgr_; + + void SwitchToForward(); + + IteratorWrapper* CurrentForward() const { + assert(direction_ == kForward); + return !minHeap_.empty() ? 
minHeap_.top() : nullptr; + } + + IteratorWrapper* CurrentReverse() const { + assert(direction_ == kReverse); + assert(maxHeap_); + return !maxHeap_->empty() ? maxHeap_->top() : nullptr; + } +}; + +void MergingIterator::SwitchToForward() { + // Otherwise, advance the non-current children. We advance current_ + // just after the if-block. + ClearHeaps(); + Slice target = key(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(target); + considerStatus(child.status()); + if (child.Valid() && comparator_->Equal(target, child.key())) { + child.Next(); + considerStatus(child.status()); + } + } + if (child.Valid()) { + minHeap_.push(&child); + } + } + direction_ = kForward; +} + +void MergingIterator::ClearHeaps() { + minHeap_.clear(); + if (maxHeap_) { + maxHeap_->clear(); + } +} + +void MergingIterator::InitMaxHeap() { + if (!maxHeap_) { + maxHeap_.reset(new MergerMaxIterHeap(comparator_)); + } +} + +InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, + InternalIterator** list, int n, + Arena* arena, bool prefix_seek_mode) { + assert(n >= 0); + if (n == 0) { + return NewEmptyInternalIterator<Slice>(arena); + } else if (n == 1) { + return list[0]; + } else { + if (arena == nullptr) { + return new MergingIterator(cmp, list, n, false, prefix_seek_mode); + } else { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode); + } + } +} + +MergeIteratorBuilder::MergeIteratorBuilder( + const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode) + : first_iter(nullptr), use_merging_iter(false), arena(a) { + auto mem = arena->AllocateAligned(sizeof(MergingIterator)); + merge_iter = + new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode); +} + +MergeIteratorBuilder::~MergeIteratorBuilder() { + if (first_iter != nullptr) { + first_iter->~InternalIterator(); + } + if (merge_iter != nullptr) { + merge_iter->~MergingIterator(); + } +} + +void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { + if (!use_merging_iter && first_iter != nullptr) { + merge_iter->AddIterator(first_iter); + use_merging_iter = true; + first_iter = nullptr; + } + if (use_merging_iter) { + merge_iter->AddIterator(iter); + } else { + first_iter = iter; + } +} + +InternalIterator* MergeIteratorBuilder::Finish() { + InternalIterator* ret = nullptr; + if (!use_merging_iter) { + ret = first_iter; + first_iter = nullptr; + } else { + ret = merge_iter; + merge_iter = nullptr; + } + return ret; +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/merging_iterator.h b/src/rocksdb/table/merging_iterator.h new file mode 100644 index 00000000..21ff79bf --- /dev/null +++ b/src/rocksdb/table/merging_iterator.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#include "db/dbformat.h" +#include "rocksdb/types.h" + +namespace rocksdb { + +class Comparator; +class Env; +class Arena; +template <class TValue> +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase<Slice>; + +// Return an iterator that provided the union of the data in +// children[0,n-1]. Takes ownership of the child iterators and +// will delete them when the result iterator is deleted. +// +// The result does no duplicate suppression. I.e., if a particular +// key is present in K child iterators, it will be yielded K times. +// +// REQUIRES: n >= 0 +extern InternalIterator* NewMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, int n, + Arena* arena = nullptr, bool prefix_seek_mode = false); + +class MergingIterator; + +// A builder class to build a merging iterator by adding iterators one by one. +class MergeIteratorBuilder { + public: + // comparator: the comparator used in merging comparator + // arena: where the merging iterator needs to be allocated from. + explicit MergeIteratorBuilder(const InternalKeyComparator* comparator, + Arena* arena, bool prefix_seek_mode = false); + ~MergeIteratorBuilder(); + + // Add iter to the merging iterator. + void AddIterator(InternalIterator* iter); + + // Get arena used to build the merging iterator. It is called one a child + // iterator needs to be allocated. + Arena* GetArena() { return arena; } + + // Return the result merging iterator. + InternalIterator* Finish(); + + private: + MergingIterator* merge_iter; + InternalIterator* first_iter; + bool use_merging_iter; + Arena* arena; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc new file mode 100644 index 00000000..57111cfe --- /dev/null +++ b/src/rocksdb/table/meta_blocks.cc @@ -0,0 +1,514 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include "table/meta_blocks.h" + +#include <map> +#include <string> + +#include "db/table_properties_collector.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "table/persistent_cache_helper.h" +#include "table/table_properties_internal.h" +#include "util/coding.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +MetaIndexBuilder::MetaIndexBuilder() + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} + +void MetaIndexBuilder::Add(const std::string& key, + const BlockHandle& handle) { + std::string handle_encoding; + handle.EncodeTo(&handle_encoding); + meta_block_handles_.insert({key, handle_encoding}); +} + +Slice MetaIndexBuilder::Finish() { + for (const auto& metablock : meta_block_handles_) { + meta_index_block_->Add(metablock.first, metablock.second); + } + return meta_index_block_->Finish(); +} + +// Property block will be read sequentially and cached in a heap located +// object, so there's no need for restart points. Thus we set the restart +// interval to infinity to save space. 
+PropertyBlockBuilder::PropertyBlockBuilder() + : properties_block_( + new BlockBuilder(port::kMaxInt32 /* restart interval */)) {} + +void PropertyBlockBuilder::Add(const std::string& name, + const std::string& val) { + props_.insert({name, val}); +} + +void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) { + assert(props_.find(name) == props_.end()); + + std::string dst; + PutVarint64(&dst, val); + + Add(name, dst); +} + +void PropertyBlockBuilder::Add( + const UserCollectedProperties& user_collected_properties) { + for (const auto& prop : user_collected_properties) { + Add(prop.first, prop.second); + } +} + +void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { + Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); + Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); + Add(TablePropertiesNames::kDataSize, props.data_size); + Add(TablePropertiesNames::kIndexSize, props.index_size); + if (props.index_partitions != 0) { + Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); + Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); + } + Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); + Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); + Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); + Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); + Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); + Add(TablePropertiesNames::kFilterSize, props.filter_size); + Add(TablePropertiesNames::kFormatVersion, props.format_version); + Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); + Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id); + Add(TablePropertiesNames::kCreationTime, props.creation_time); + Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time); + + if (!props.filter_policy_name.empty()) { + Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); + } + if (!props.comparator_name.empty()) { + Add(TablePropertiesNames::kComparator, props.comparator_name); + } + + if (!props.merge_operator_name.empty()) { + Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name); + } + if (!props.prefix_extractor_name.empty()) { + Add(TablePropertiesNames::kPrefixExtractorName, + props.prefix_extractor_name); + } + if (!props.property_collectors_names.empty()) { + Add(TablePropertiesNames::kPropertyCollectors, + props.property_collectors_names); + } + if (!props.column_family_name.empty()) { + Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name); + } + + if (!props.compression_name.empty()) { + Add(TablePropertiesNames::kCompression, props.compression_name); + } + if (!props.compression_options.empty()) { + Add(TablePropertiesNames::kCompressionOptions, props.compression_options); + } +} + +Slice PropertyBlockBuilder::Finish() { + for (const auto& prop : props_) { + properties_block_->Add(prop.first, prop.second); + } + + return properties_block_->Finish(); +} + +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name) { + assert(method == "Add" || method == "Finish"); + + std::string msg = + "Encountered error when calling TablePropertiesCollector::" + + method + "() with collector name: " + name; + ROCKS_LOG_ERROR(info_log, "%s", msg.c_str()); 
+} + +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, const Slice& value, uint64_t file_size, + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log) { + bool all_succeeded = true; + for (auto& collector : collectors) { + Status s = collector->InternalAdd(key, value, file_size); + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Add" /* method */, + collector->Name()); + } + } + return all_succeeded; +} + +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, + const uint64_t blockCompressedBytesSlow) { + for (auto& collector : collectors) { + collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, + blockCompressedBytesSlow); + } +} + +bool NotifyCollectTableCollectorsOnFinish( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log, PropertyBlockBuilder* builder) { + bool all_succeeded = true; + for (auto& collector : collectors) { + UserCollectedProperties user_collected_properties; + Status s = collector->Finish(&user_collected_properties); + + all_succeeded = all_succeeded && s.ok(); + if (!s.ok()) { + LogPropertiesCollectionError(info_log, "Finish" /* method */, + collector->Name()); + } else { + builder->Add(user_collected_properties); + } + } + + return all_succeeded; +} + +Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, + const ImmutableCFOptions& ioptions, + TableProperties** table_properties, bool verify_checksum, + BlockHandle* ret_block_handle, + CacheAllocationPtr* verification_buf, + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { + assert(table_properties); + + Slice v = handle_value; + BlockHandle handle; + if (!handle.DecodeFrom(&v).ok()) { + return Status::InvalidArgument("Failed to decode properties block handle"); + } + + BlockContents block_contents; + ReadOptions read_options; + read_options.verify_checksums = verify_checksum; + Status s; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, handle, &block_contents, + ioptions, false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = block_fetcher.ReadBlockContents(); + // property block is never compressed. Need to add uncompress logic if we are + // to compress it.. 
+ + if (!s.ok()) { + return s; + } + + Block properties_block(std::move(block_contents), + kDisableGlobalSequenceNumber); + DataBlockIter iter; + properties_block.NewIterator<DataBlockIter>(BytewiseComparator(), + BytewiseComparator(), &iter); + + auto new_table_properties = new TableProperties(); + // All pre-defined properties of type uint64_t + std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = { + {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, + {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, + {TablePropertiesNames::kIndexPartitions, + &new_table_properties->index_partitions}, + {TablePropertiesNames::kTopLevelIndexSize, + &new_table_properties->top_level_index_size}, + {TablePropertiesNames::kIndexKeyIsUserKey, + &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, + {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, + {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, + {TablePropertiesNames::kRawValueSize, + &new_table_properties->raw_value_size}, + {TablePropertiesNames::kNumDataBlocks, + &new_table_properties->num_data_blocks}, + {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, + {TablePropertiesNames::kFormatVersion, + &new_table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, + &new_table_properties->fixed_key_len}, + {TablePropertiesNames::kColumnFamilyId, + &new_table_properties->column_family_id}, + {TablePropertiesNames::kCreationTime, + &new_table_properties->creation_time}, + {TablePropertiesNames::kOldestKeyTime, + &new_table_properties->oldest_key_time}, + }; + + std::string last_key; + for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) { + s = iter.status(); + if (!s.ok()) { + break; + } + + auto key = iter.key().ToString(); + // properties block should be strictly sorted with no duplicate key. 
+ if (!last_key.empty() && + BytewiseComparator()->Compare(key, last_key) <= 0) { + s = Status::Corruption("properties unsorted"); + break; + } + last_key = key; + + auto raw_val = iter.value(); + auto pos = predefined_uint64_properties.find(key); + + new_table_properties->properties_offsets.insert( + {key, handle.offset() + iter.ValueOffset()}); + + if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + // handle predefined rocksdb properties + uint64_t val; + if (!GetVarint64(&raw_val, &val)) { + // skip malformed value + auto error_msg = + "Detect malformed value in properties meta-block:" + "\tkey: " + key + "\tval: " + raw_val.ToString(); + ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str()); + continue; + } + *(pos->second) = val; + } else if (key == TablePropertiesNames::kFilterPolicy) { + new_table_properties->filter_policy_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kColumnFamilyName) { + new_table_properties->column_family_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kComparator) { + new_table_properties->comparator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kMergeOperator) { + new_table_properties->merge_operator_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPrefixExtractorName) { + new_table_properties->prefix_extractor_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kPropertyCollectors) { + new_table_properties->property_collectors_names = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompression) { + new_table_properties->compression_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompressionOptions) { + new_table_properties->compression_options = raw_val.ToString(); + } else { + // handle user-collected properties + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } + } + if (s.ok()) { + *table_properties = new_table_properties; + if (ret_block_handle != nullptr) { + *ret_block_handle = handle; + } + if (verification_buf != nullptr) { + size_t len = handle.size() + kBlockTrailerSize; + *verification_buf = rocksdb::AllocateBlock(len, memory_allocator); + if (verification_buf->get() != nullptr) { + memcpy(verification_buf->get(), block_contents.data.data(), len); + } + } + } else { + delete new_table_properties; + } + + return s; +} + +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + TableProperties** properties, + bool compression_type_missing, + MemoryAllocator* memory_allocator) { + // -- Read metaindex block + Footer footer; + auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, table_magic_number); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, 
memory_allocator); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + // property blocks are never compressed. Need to add uncompress logic if we + // are to compress it. + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + std::unique_ptr<InternalIterator> meta_iter( + metaindex_block.NewIterator<DataBlockIter>(BytewiseComparator(), + BytewiseComparator())); + + // -- Read property block + bool found_properties_block = true; + s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); + if (!s.ok()) { + return s; + } + + TableProperties table_properties; + if (found_properties_block == true) { + s = ReadProperties( + meta_iter->value(), file, nullptr /* prefetch_buffer */, footer, + ioptions, properties, false /* verify_checksum */, + nullptr /* ret_block_hanel */, nullptr /* ret_block_contents */, + compression_type_missing, memory_allocator); + } else { + s = Status::NotFound(); + } + + return s; +} + +Status FindMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle) { + meta_index_iter->Seek(meta_block_name); + if (meta_index_iter->status().ok() && meta_index_iter->Valid() && + meta_index_iter->key() == meta_block_name) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } else { + return Status::Corruption("Cannot find the meta block", meta_block_name); + } +} + +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { + Footer footer; + auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, table_magic_number); + if (!s.ok()) { + return s; + } + + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, + false /* do decompression */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + // meta blocks are never compressed. Need to add uncompress logic if we are to + // compress it. 
+ Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + std::unique_ptr<InternalIterator> meta_iter; + meta_iter.reset(metaindex_block.NewIterator<DataBlockIter>( + BytewiseComparator(), BytewiseComparator())); + + return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); +} + +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + const std::string& meta_block_name, + BlockContents* contents, bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { + Status status; + Footer footer; + status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, + table_magic_number); + if (!status.ok()) { + return status; + } + + // Reading metaindex block + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); + status = block_fetcher.ReadBlockContents(); + if (!status.ok()) { + return status; + } + // meta block is never compressed. Need to add uncompress logic if we are to + // compress it. + + // Finding metablock + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + std::unique_ptr<InternalIterator> meta_iter; + meta_iter.reset(metaindex_block.NewIterator<DataBlockIter>( + BytewiseComparator(), BytewiseComparator())); + + BlockHandle block_handle; + status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); + + if (!status.ok()) { + return status; + } + + // Reading metablock + BlockFetcher block_fetcher2( + file, prefetch_buffer, footer, read_options, block_handle, contents, + ioptions, false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + return block_fetcher2.ReadBlockContents(); +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h new file mode 100644 index 00000000..6efd1225 --- /dev/null +++ b/src/rocksdb/table/meta_blocks.h @@ -0,0 +1,151 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
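The FindMetaBlock/ReadMetaBlock pair above is, at its core, a two-step lookup: the footer points at the metaindex block, the metaindex block maps a meta block's name to a BlockHandle, and that handle names the byte range of the file to fetch. The following is a minimal, self-contained sketch of that flow, with a std::map standing in for the metaindex block and a std::string standing in for the SST file; the types and names here are illustrative, not RocksDB's own.

#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <string>

struct Handle {            // stand-in for BlockHandle
  uint64_t offset = 0;
  uint64_t size = 0;
};

// Stand-in for the metaindex block: meta block name -> handle.
using MetaIndex = std::map<std::string, Handle>;

// Mirrors FindMetaBlock(): look the name up, fail if it is absent.
std::optional<Handle> FindMetaBlock(const MetaIndex& metaindex,
                                    const std::string& name) {
  auto it = metaindex.find(name);
  if (it == metaindex.end()) return std::nullopt;
  return it->second;
}

// Mirrors ReadMetaBlock(): resolve the name, then read that byte range.
std::optional<std::string> ReadMetaBlock(const std::string& file,
                                         const MetaIndex& metaindex,
                                         const std::string& name) {
  auto handle = FindMetaBlock(metaindex, name);
  if (!handle) return std::nullopt;
  return file.substr(handle->offset, handle->size);
}

int main() {
  std::string file = "....PROPERTIES_BYTES....";
  MetaIndex metaindex = {{"rocksdb.properties", {4, 16}}};
  if (auto block = ReadMetaBlock(file, metaindex, "rocksdb.properties")) {
    std::cout << *block << "\n";  // PROPERTIES_BYTES
  }
}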
+#pragma once + +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "db/builder.h" +#include "db/table_properties_collector.h" +#include "rocksdb/comparator.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "util/kv_map.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class Env; +class Footer; +class Logger; +class RandomAccessFile; +struct TableProperties; + +class MetaIndexBuilder { + public: + MetaIndexBuilder(const MetaIndexBuilder&) = delete; + MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete; + + MetaIndexBuilder(); + void Add(const std::string& key, const BlockHandle& handle); + + // Write all the added key/value pairs to the block and return the contents + // of the block. + Slice Finish(); + + private: + // stores the sorted key/handle pairs of the meta blocks. + stl_wrappers::KVMap meta_block_handles_; + std::unique_ptr<BlockBuilder> meta_index_block_; +}; + +class PropertyBlockBuilder { + public: + PropertyBlockBuilder(const PropertyBlockBuilder&) = delete; + PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete; + + PropertyBlockBuilder(); + + void AddTableProperty(const TableProperties& props); + void Add(const std::string& key, uint64_t value); + void Add(const std::string& key, const std::string& value); + void Add(const UserCollectedProperties& user_collected_properties); + + // Write all the added entries to the block and return the block contents + Slice Finish(); + + private: + std::unique_ptr<BlockBuilder> properties_block_; + stl_wrappers::KVMap props_; +}; + +// If any error occurs during user-defined statistics collection, we'll write +// the warning message to the info log. +void LogPropertiesCollectionError( + Logger* info_log, const std::string& method, const std::string& name); + +// Utility functions that help the table builder trigger batch events for +// user-defined property collectors. +// The return value indicates whether any error occurred; if one did, the +// warning message will have been logged. +// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all +// property collectors. +bool NotifyCollectTableCollectorsOnAdd( + const Slice& key, const Slice& value, uint64_t file_size, + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log); + +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow); + +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all +// property collectors. The collected properties will be added to `builder`. +bool NotifyCollectTableCollectorsOnFinish( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + Logger* info_log, PropertyBlockBuilder* builder); + +// Read the properties from the table. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise the value of `table_properties` will not be modified.
+Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, + const ImmutableCFOptions& ioptions, + TableProperties** table_properties, bool verify_checksum, + BlockHandle* block_handle, + CacheAllocationPtr* verification_buf, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); + +// Directly read the properties from the properties block of a plain table. +// @returns a status to indicate if the operation succeeded. On success, +// *table_properties will point to a heap-allocated TableProperties +// object, otherwise value of `table_properties` will not be modified. +// certain tables do not have compression_type byte setup properly for +// uncompressed blocks, caller can request to reset compression type by +// passing compression_type_missing = true, the same applies to +// `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + TableProperties** properties, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); + +// Find the meta block from the meta index block. +Status FindMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle); + +// Find the meta block +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); + +// Read the specified meta block with name meta_block_name +// from `file` and initialize `contents` with contents of this block. +// Return Status::OK in case of success. +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableCFOptions& ioptions, + const std::string& meta_block_name, + BlockContents* contents, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); + +} // namespace rocksdb diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc new file mode 100644 index 00000000..65a43616 --- /dev/null +++ b/src/rocksdb/table/mock_table.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
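ReadProperties, declared just above and implemented earlier in meta_blocks.cc, walks the properties block and routes each key either into a predefined uint64_t field of TableProperties or into user_collected_properties. Below is a small, self-contained model of that dispatch, assuming only the standard library; MiniProps and the two property-name strings are illustrative stand-ins, and the real code decodes varint64 values and additionally mirrors a couple of predefined keys into the user-collected map.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <unordered_map>

// Illustrative stand-in for TableProperties with only two predefined fields.
struct MiniProps {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  std::map<std::string, std::string> user_collected;
};

// Sketch of the dispatch in ReadProperties: known keys fill uint64_t fields,
// everything else is kept verbatim as a user-collected property.
void Decode(const std::map<std::string, std::string>& block, MiniProps* out) {
  const std::unordered_map<std::string, uint64_t*> predefined = {
      {"rocksdb.data.size", &out->data_size},
      {"rocksdb.num.entries", &out->num_entries},
  };
  for (const auto& kv : block) {
    auto pos = predefined.find(kv.first);
    if (pos != predefined.end()) {
      *pos->second = std::stoull(kv.second);  // real code decodes a varint64
    } else {
      out->user_collected.emplace(kv.first, kv.second);
    }
  }
}

int main() {
  MiniProps props;
  Decode({{"rocksdb.data.size", "4096"},
          {"rocksdb.num.entries", "7"},
          {"my.collector.rows", "7"}},
         &props);
  std::cout << props.data_size << " " << props.num_entries << " "
            << props.user_collected.size() << "\n";  // 4096 7 1
}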
+ +#include "table/mock_table.h" + +#include "db/dbformat.h" +#include "port/port.h" +#include "rocksdb/table_properties.h" +#include "table/get_context.h" +#include "util/coding.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { +namespace mock { + +namespace { + +const InternalKeyComparator icmp_(BytewiseComparator()); + +} // namespace + +stl_wrappers::KVMap MakeMockFile( + std::initializer_list<std::pair<const std::string, std::string>> l) { + return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); +} + +InternalIterator* MockTableReader::NewIterator( + const ReadOptions&, const SliceTransform* /* prefix_extractor */, + Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { + return new MockTableIterator(table_); +} + +Status MockTableReader::Get(const ReadOptions&, const Slice& key, + GetContext* get_context, + const SliceTransform* /*prefix_extractor*/, + bool /*skip_filters*/) { + std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_)); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) { + break; + } + } + return Status::OK(); +} + +std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties() + const { + return std::shared_ptr<const TableProperties>(new TableProperties()); +} + +MockTableFactory::MockTableFactory() : next_id_(1) {} + +Status MockTableFactory::NewTableReader( + const TableReaderOptions& /*table_reader_options*/, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/, + std::unique_ptr<TableReader>* table_reader, + bool /*prefetch_index_and_filter_in_cache*/) const { + uint32_t id = GetIDFromFile(file.get()); + + MutexLock lock_guard(&file_system_.mutex); + + auto it = file_system_.files.find(id); + if (it == file_system_.files.end()) { + return Status::IOError("Mock file not found"); + } + + table_reader->reset(new MockTableReader(it->second)); + + return Status::OK(); +} + +TableBuilder* MockTableFactory::NewTableBuilder( + const TableBuilderOptions& /*table_builder_options*/, + uint32_t /*column_family_id*/, WritableFileWriter* file) const { + uint32_t id = GetAndWriteNextID(file); + + return new MockTableBuilder(id, &file_system_); +} + +Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, + stl_wrappers::KVMap file_contents) { + std::unique_ptr<WritableFile> file; + auto s = env->NewWritableFile(fname, &file, EnvOptions()); + if (!s.ok()) { + return s; + } + + WritableFileWriter file_writer(std::move(file), fname, EnvOptions()); + + uint32_t id = GetAndWriteNextID(&file_writer); + file_system_.files.insert({id, std::move(file_contents)}); + return Status::OK(); +} + +uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { + uint32_t next_id = next_id_.fetch_add(1); + char buf[4]; + EncodeFixed32(buf, next_id); + file->Append(Slice(buf, 4)); + return next_id; +} + +uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { + char buf[4]; + Slice result; + file->Read(0, 4, &result, buf); + assert(result.size() == 4); + return DecodeFixed32(buf); +} + +void MockTableFactory::AssertSingleFile( + const stl_wrappers::KVMap& file_contents) { + ASSERT_EQ(file_system_.files.size(), 1U); + ASSERT_EQ(file_contents, file_system_.files.begin()->second); +} + +void 
MockTableFactory::AssertLatestFile( + const stl_wrappers::KVMap& file_contents) { + ASSERT_GE(file_system_.files.size(), 1U); + auto latest = file_system_.files.end(); + --latest; + + if (file_contents != latest->second) { + std::cout << "Wrong content! Content of latest file:" << std::endl; + for (const auto& kv : latest->second) { + ParsedInternalKey ikey; + std::string key, value; + std::tie(key, value) = kv; + ParseInternalKey(Slice(key), &ikey); + std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + } + FAIL(); + } +} + +} // namespace mock +} // namespace rocksdb diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h new file mode 100644 index 00000000..2f123a96 --- /dev/null +++ b/src/rocksdb/table/mock_table.h @@ -0,0 +1,197 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <algorithm> +#include <atomic> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <utility> + +#include "util/kv_map.h" +#include "port/port.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table.h" +#include "table/internal_iterator.h" +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { +namespace mock { + +stl_wrappers::KVMap MakeMockFile( + std::initializer_list<std::pair<const std::string, std::string>> l = {}); + +struct MockTableFileSystem { + port::Mutex mutex; + std::map<uint32_t, stl_wrappers::KVMap> files; +}; + +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + + virtual size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr<const TableProperties> GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const stl_wrappers::KVMap& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.lower_bound(str_target); + } + + void SeekForPrev(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.upper_bound(str_target); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return 
Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const stl_wrappers::KVMap& table_; + stl_wrappers::KVMap::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) + : id_(id), file_system_(file_system) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + table_.insert({key.ToString(), value.ToString()}); + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + private: + uint32_t id_; + MockTableFileSystem* file_system_; + stl_wrappers::KVMap table_; +}; + +class MockTableFactory : public TableFactory { + public: + MockTableFactory(); + const char* Name() const override { return "MockTable"; } + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_familly_id, WritableFileWriter* file) const override; + + // This function will directly create mock table instead of going through + // MockTableBuilder. file_contents has to have a format of <internal_key, + // value>. Those key-value pairs will then be inserted into the mock table. + Status CreateMockTable(Env* env, const std::string& fname, + stl_wrappers::KVMap file_contents); + + virtual Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::OK(); + } + + virtual std::string GetPrintableTableOptions() const override { + return std::string(); + } + + // This function will assert that only a single file exists and that the + // contents are equal to file_contents + void AssertSingleFile(const stl_wrappers::KVMap& file_contents); + void AssertLatestFile(const stl_wrappers::KVMap& file_contents); + + private: + uint32_t GetAndWriteNextID(WritableFileWriter* file) const; + uint32_t GetIDFromFile(RandomAccessFileReader* file) const; + + mutable MockTableFileSystem file_system_; + mutable std::atomic<uint32_t> next_id_; +}; + +} // namespace mock +} // namespace rocksdb diff --git a/src/rocksdb/table/partitioned_filter_block.cc b/src/rocksdb/table/partitioned_filter_block.cc new file mode 100644 index 00000000..aab0f550 --- /dev/null +++ b/src/rocksdb/table/partitioned_filter_block.cc @@ -0,0 +1,355 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
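MockTableIterator in mock_table.h above expresses the iterator contract directly in terms of a sorted std::map: Seek is lower_bound, SeekForPrev is upper_bound followed by Prev, and stepping Prev past the first entry leaves the iterator invalid. A tiny self-contained sketch of those semantics, keeping only the key side (purely illustrative):

#include <cassert>
#include <iterator>
#include <map>
#include <string>

// Minimal model of MockTableIterator over a sorted key/value map.
class MapIter {
 public:
  explicit MapIter(const std::map<std::string, std::string>& t)
      : table_(t), it_(t.end()) {}
  bool Valid() const { return it_ != table_.end(); }
  void SeekToFirst() { it_ = table_.begin(); }
  void Seek(const std::string& target) { it_ = table_.lower_bound(target); }
  void SeekForPrev(const std::string& target) {
    it_ = table_.upper_bound(target);
    Prev();
  }
  void Next() { ++it_; }
  void Prev() {
    // Stepping before the first entry leaves the iterator invalid (== end()).
    it_ = (it_ == table_.begin()) ? table_.end() : std::prev(it_);
  }
  const std::string& key() const { return it_->first; }

 private:
  const std::map<std::string, std::string>& table_;
  std::map<std::string, std::string>::const_iterator it_;
};

int main() {
  std::map<std::string, std::string> kv = {{"a", "1"}, {"c", "2"}, {"e", "3"}};
  MapIter it(kv);
  it.Seek("b");         // first key >= "b"
  assert(it.Valid() && it.key() == "c");
  it.SeekForPrev("d");  // last key <= "d"
  assert(it.Valid() && it.key() == "c");
  it.SeekForPrev("@");  // nothing <= "@": the iterator becomes invalid
  assert(!it.Valid());
}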
+ +#include "table/partitioned_filter_block.h" + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include <malloc_np.h> +#else +#include <malloc.h> +#endif +#endif +#include <utility> + +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "util/coding.h" + +namespace rocksdb { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + filters_in_partition_(0), + num_added_(0) { + filters_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { + // Use == to send the request only once + if (filters_in_partition_ == filters_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We keep + // requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + filter_gc.push_back(std::unique_ptr<const char[]>(nullptr)); + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, filter}); + filters_in_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + MaybeCutAFilterBlock(); + filter_bits_builder_->AddKey(key); + filters_in_partition_++; + num_added_++; +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + FilterEntry& last_entry = filters.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } + filters.pop_front(); + } else { + MaybeCutAFilterBlock(); + } + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + if (finishing_filters) { + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is 
the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + return filters.front().filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const SliceTransform* prefix_extractor, bool _whole_key_filtering, + BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, + Statistics* stats, const InternalKeyComparator comparator, + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) + : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), + prefix_extractor_(prefix_extractor), + comparator_(comparator), + table_(table), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { + idx_on_fltr_blk_.reset(new Block(std::move(contents), + kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, stats)); +} + +PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { + // TODO(myabandeh): if instead of filter object we store only the blocks in + // block cache, then we don't have to manually earse them from block cache + // here. + auto block_cache = table_->rep_->table_options.block_cache.get(); + if (UNLIKELY(block_cache == nullptr)) { + return; + } + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator<IndexBlockIter>( + &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); + biter.SeekToFirst(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value(); + auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, + table_->rep_->cache_key_prefix_size, + handle, cache_key); + block_cache->Erase(key); + } +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr) { + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!whole_key_filtering_) { + return true; + } + if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { + return true; + } + auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + bool cached = false; + auto filter_partition = + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, + &cached, prefix_extractor); + if (UNLIKELY(!filter_partition.value)) { + return true; + } + auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor, + block_offset, no_io); + if (cached) { + return res; + } + if (LIKELY(filter_partition.IsSet())) { + filter_partition.Release(table_->rep_->table_options.block_cache.get()); + } else { + delete filter_partition.value; + } + return res; +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!prefix_extractor_ && !prefix_extractor) { + return true; + } + if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { + return true; + } + auto filter_handle = 
GetFilterPartitionHandle(*const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range + return false; + } + bool cached = false; + auto filter_partition = + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, + &cached, prefix_extractor); + if (UNLIKELY(!filter_partition.value)) { + return true; + } + auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor, + kNotValid, no_io); + if (cached) { + return res; + } + if (LIKELY(filter_partition.IsSet())) { + filter_partition.Release(table_->rep_->table_options.block_cache.get()); + } else { + delete filter_partition.value; + } + return res; +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const Slice& entry) { + IndexBlockIter iter; + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator<IndexBlockIter>( + &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + return BlockHandle(0, 0); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; +} + +BlockBasedTable::CachableEntry<FilterBlockReader> +PartitionedFilterBlockReader::GetFilterPartition( + FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { + const bool is_a_filter_partition = true; + auto block_cache = table_->rep_->table_options.block_cache.get(); + if (LIKELY(block_cache != nullptr)) { + if (filter_map_.size() != 0) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); + RecordTick(statistics(), BLOCK_CACHE_HIT); + RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(iter->second.cache_handle)); + *cached = true; + return iter->second; + } + } + return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, + is_a_filter_partition, no_io, + /* get_context */ nullptr, prefix_extractor); + } else { + auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, + is_a_filter_partition, prefix_extractor); + return {filter, nullptr}; + } +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = idx_on_fltr_blk_->usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// Release the cached entry and decrement its ref count. 
+void ReleaseFilterCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle); +} + +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies( + bool pin, const SliceTransform* prefix_extractor) { + // Before read partitions, prefetch them to avoid lots of IOs + auto rep = table_->rep_; + IndexBlockIter biter; + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator<IndexBlockIter>( + &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value(); + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value(); + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + Status s; + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + Cache* block_cache = rep->table_options.block_cache.get(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value(); + const bool no_io = true; + const bool is_a_filter_partition = true; + auto filter = table_->GetFilter( + prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, + /* get_context */ nullptr, prefix_extractor); + if (LIKELY(filter.IsSet())) { + if (pin) { + filter_map_[handle.offset()] = std::move(filter); + RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, + filter.cache_handle); + } else { + block_cache->Release(filter.cache_handle); + } + } else { + delete filter.value; + } + } +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/partitioned_filter_block.h b/src/rocksdb/table/partitioned_filter_block.h new file mode 100644 index 00000000..5d55da54 --- /dev/null +++ b/src/rocksdb/table/partitioned_filter_block.h @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
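PartitionedFilterBlockBuilder::Finish above hands back one filter partition per call and signals Status::Incomplete until only the index on filter partitions remains, with the caller passing back the handle of the partition it just wrote; the test fixture further down drives exactly this do/while loop. A self-contained toy version of that producer protocol (all names are made up; a plain string plays the role of the output file):

#include <cstdint>
#include <deque>
#include <iostream>
#include <string>

struct Handle { uint64_t offset = 0, size = 0; };

// Toy builder: hands out pending partitions one per Finish() call, records
// the handle of the previously written partition, then emits the "index".
class ToyPartitionedBuilder {
 public:
  explicit ToyPartitionedBuilder(std::deque<std::string> parts)
      : parts_(std::move(parts)) {}

  // incomplete=true means "call me again with the handle you just wrote".
  std::string Finish(const Handle& last_written, bool* incomplete) {
    if (started_) index_ += "[" + std::to_string(last_written.offset) + "]";
    if (!parts_.empty()) {
      started_ = true;
      *incomplete = true;
      std::string next = parts_.front();
      parts_.pop_front();
      return next;
    }
    *incomplete = false;
    return index_;  // top-level index over the partitions written so far
  }

 private:
  std::deque<std::string> parts_;
  std::string index_;
  bool started_ = false;
};

int main() {
  ToyPartitionedBuilder builder({"filter-0", "filter-1"});
  std::string file;
  Handle handle;  // handle of the block written in the previous iteration
  bool incomplete = false;
  do {
    std::string block = builder.Finish(handle, &incomplete);
    handle = {file.size(), block.size()};  // "write" the block, note its handle
    file += block;
  } while (incomplete);
  std::cout << file << "\n";  // filter-0filter-1[0][8]
}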
+ +#pragma once + +#include <list> +#include <string> +#include <unordered_map> +#include "db/dbformat.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/full_filter_block.h" +#include "table/index_builder.h" +#include "util/autovector.h" + +namespace rocksdb { + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + + size_t NumAdded() const override { return num_added_; } + + virtual Slice Finish(const BlockHandle& last_partition_block_handle, + Status* status) override; + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + Slice filter; + }; + std::list<FilterEntry> filters; // list of partitioned indexes and their keys + std::unique_ptr<IndexBuilder> value; + std::vector<std::unique_ptr<const char[]>> filter_gc; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when cut a filter block and Finish it + void MaybeCutAFilterBlock(); + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potentioal optimizations in future. 
If such + // optimizations did not realize we can use different number of partitions and + // eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of filters per partition + uint32_t filters_per_partition_; + // The current number of filters in the last partition + uint32_t filters_in_partition_; + // Number of keys added + size_t num_added_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReader, + public Cleanable { + public: + explicit PartitionedFilterBlockReader( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + BlockContents&& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats, const InternalKeyComparator comparator, + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); + virtual ~PartitionedFilterBlockReader(); + + virtual bool IsBlockBased() override { return false; } + virtual bool KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + virtual bool PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, + const Slice* const const_ikey_ptr = nullptr) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const Slice& entry); + BlockBasedTable::CachableEntry<FilterBlockReader> GetFilterPartition( + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); + virtual void CacheDependencies( + bool bin, const SliceTransform* prefix_extractor) override; + + const SliceTransform* prefix_extractor_; + std::unique_ptr<Block> idx_on_fltr_blk_; + const InternalKeyComparator comparator_; + const BlockBasedTable* table_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; + std::unordered_map<uint64_t, + BlockBasedTable::CachableEntry<FilterBlockReader>> + filter_map_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/partitioned_filter_block_test.cc b/src/rocksdb/table/partitioned_filter_block_test.cc new file mode 100644 index 00000000..8068f14d --- /dev/null +++ b/src/rocksdb/table/partitioned_filter_block_test.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
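On the read side, GetFilterPartitionHandle shown earlier Seek()s the top-level index with the query key, so the partition keyed by the first index entry not less than the query is the one consulted, and an invalid seek means the key lies beyond the last partition. The sketch below models that with std::map::lower_bound and an std::set per partition in place of a Bloom filter, so unlike the real filter it has no false positives; it is illustrative only, and the sample keys mirror the ones used in the test.

#include <iostream>
#include <map>
#include <set>
#include <string>

// Each partition is keyed in the index by the last key it covers.
using PartitionedFilter = std::map<std::string, std::set<std::string>>;

// Mirrors the reader: find the owning partition, then ask only that partition.
bool KeyMayMatch(const PartitionedFilter& index, const std::string& key) {
  auto it = index.lower_bound(key);      // first partition with last_key >= key
  if (it == index.end()) return false;   // out of range of every partition
  return it->second.count(key) > 0;
}

int main() {
  PartitionedFilter index = {
      {"box", {"afoo", "bar", "box"}},  // partition covering keys up to "box"
      {"hello", {"hello"}},             // partition covering keys up to "hello"
  };
  std::cout << KeyMayMatch(index, "bar") << "\n";  // 1: found in partition "box"
  std::cout << KeyMayMatch(index, "cat") << "\n";  // 0: partition "hello" has no "cat"
  std::cout << KeyMayMatch(index, "zzz") << "\n";  // 0: past the last partition
}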
+ +#include <map> + +#include "rocksdb/filter_policy.h" + +#include "table/full_filter_bits_builder.h" +#include "table/index_builder.h" +#include "table/partitioned_filter_block.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +std::map<uint64_t, Slice> slices; + +class MockedBlockBasedTable : public BlockBasedTable { + public: + explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + // Initialize what Open normally does as much as necessary for the test + rep->cache_key_prefix_size = 10; + } + + CachableEntry<FilterBlockReader> GetFilter( + FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, + const bool /* unused */, bool /* unused */, GetContext* /* unused */, + const SliceTransform* prefix_extractor) const override { + Slice slice = slices[filter_blk_handle.offset()]; + auto obj = new FullFilterBlockReader( + prefix_extractor, true, BlockContents(slice), + rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); + return {obj, nullptr}; + } + + FilterBlockReader* ReadFilter( + FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, + const bool /* unused */, + const SliceTransform* prefix_extractor) const override { + Slice slice = slices[filter_blk_handle.offset()]; + auto obj = new FullFilterBlockReader( + prefix_extractor, true, BlockContents(slice), + rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); + return obj; + } +}; + +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); + + PartitionedFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); + table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close + // will access variable that are not + // initialized in our mocked version + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; + } + + std::shared_ptr<Cache> cache_; + ~PartitionedFilterBlockTest() override {} + + const std::string keys[4] = {"afoo", "bar", "box", "hello"}; + const std::string missing_keys[2] = {"missing", "other"}; + + uint64_t MaxIndexSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + uint64_t max_key_size = 0; + for (int i = 1; i < num_keys; i++) { + max_key_size = std::max(max_key_size, static_cast<uint64_t>(keys[i].size())); + } + uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/); + return max_index_size; + } + + uint64_t MaxFilterSize() { + uint32_t dont_care1, dont_care2; + int num_keys = sizeof(keys) / sizeof(*keys); + auto filter_bits_reader = dynamic_cast<rocksdb::FullFilterBitsBuilder*>( + table_options_.filter_policy->GetFilterBitsBuilder()); + assert(filter_bits_reader); + auto partition_size = + filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2); + delete filter_bits_reader; + return partition_size + + partition_size * table_options_.block_size_deviation / 100; + } + + int last_offset = 10; + BlockHandle Write(const Slice& slice) { + BlockHandle bh(last_offset + 1, slice.size()); + slices[bh.offset()] = slice; + last_offset += bh.size(); + return bh; + } + + PartitionedIndexBuilder* NewIndexBuilder() { + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); + } 
+ + PartitionedFilterBlockBuilder* NewBuilder( + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { + assert(table_options_.block_size_deviation <= 100); + auto partition_size = static_cast<uint32_t>( + ((table_options_.metadata_block_size * + (100 - table_options_.block_size_deviation)) + + 99) / + 100); + partition_size = std::max(partition_size, static_cast<uint32_t>(1)); + const bool kValueDeltaEncoded = true; + return new PartitionedFilterBlockBuilder( + prefix_extractor, table_options_.whole_key_filtering, + table_options_.filter_policy->GetFilterBitsBuilder(), + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); + } + + std::unique_ptr<MockedBlockBasedTable> table; + + PartitionedFilterBlockReader* NewReader( + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, + const SliceTransform* prefix_extractor) { + BlockHandle bh; + Status status; + Slice slice; + do { + slice = builder->Finish(bh, &status); + bh = Write(slice); + } while (status.IsIncomplete()); + const Options options; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const EnvOptions env_options; + const bool kSkipFilters = true; + const bool kImmortal = true; + table.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, + !kSkipFilters, 0, !kImmortal))); + auto reader = new PartitionedFilterBlockReader( + prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, + table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); + return reader; + } + + void VerifyReader(PartitionedFilterBlockBuilder* builder, + PartitionedIndexBuilder* pib, bool empty = false, + const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder, pib, prefix_extractor)); + // Querying added keys + const bool no_io = true; + for (auto key : keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, + &ikey_slice)); + } + { + // querying a key twice + auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, + !no_io, &ikey_slice)); + } + // querying missing keys + for (auto key : missing_keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + if (empty) { + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); + } else { + // assuming a good hash function + ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); + } + } + } + + int TestBlockPerKey() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + return CountNumOfIndexPartitions(pib.get()); + } + + void TestBlockPerTwoKeys(const 
SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor)); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get(), prefix_extractor); + } + + void TestBlockPerAllKeys() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + } + + void CutABlock(PartitionedIndexBuilder* builder, + const std::string& user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + builder->AddIndexEntry(&key, nullptr, dont_care_block_handle); + } + + void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, + const std::string& next_user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + std::string next_key = std::string( + *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + Slice slice = Slice(next_key.data(), next_key.size()); + builder->AddIndexEntry(&key, &slice, dont_care_block_handle); + } + + int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) { + IndexBuilder::IndexBlocks dont_care_ib; + BlockHandle dont_care_bh(10, 10); + Status s; + int cnt = 0; + do { + s = builder->Finish(&dont_care_ib, dont_care_bh); + cnt++; + } while (s.IsIncomplete()); + return cnt - 1; // 1 is 2nd level index + } +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get())); + const bool empty = true; + VerifyReader(builder.get(), pib.get(), empty); +} + +TEST_P(PartitionedFilterBlockTest, OneBlock) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerAllKeys(); + } +} + +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerTwoKeys(); + } +} + +// This reproduces the bug that a prefix is the same among multiple consecutive +// blocks but the bug would add it only to the first block. 
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor + (rocksdb::NewFixedPrefixTransform(1)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key1", "p-key2", "p-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get(), prefix_extractor.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), + prefix_extractor.get(), kNotValid, + false /*no_io*/, &ikey_slice)); + } +} + +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerKey(); + } +} + +TEST_P(PartitionedFilterBlockTest, PartitionCount) { + int num_keys = sizeof(keys) / sizeof(*keys); + table_options_.metadata_block_size = + std::max(MaxIndexSize(), MaxFilterSize()); + int partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, 1); + // A low number ensures cutting a block after each key + table_options_.metadata_block_size = 1; + partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/persistent_cache_helper.cc b/src/rocksdb/table/persistent_cache_helper.cc new file mode 100644 index 00000000..4e90697a --- /dev/null +++ b/src/rocksdb/table/persistent_cache_helper.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
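The test fixture's NewBuilder above sizes each filter partition from metadata_block_size and block_size_deviation using a rounded-up percentage that is clamped to at least one entry, which is what lets PartitionCount force either a single partition or a cut after nearly every key. A tiny self-contained check of that arithmetic (the sample values are arbitrary):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Same rounded-up percentage used by the test's NewBuilder: keep
// (100 - deviation)% of metadata_block_size, but never less than 1.
uint32_t PartitionSize(uint64_t metadata_block_size, uint64_t deviation_pct) {
  uint32_t size = static_cast<uint32_t>(
      ((metadata_block_size * (100 - deviation_pct)) + 99) / 100);
  return std::max<uint32_t>(size, 1);
}

int main() {
  std::cout << PartitionSize(4096, 0) << "\n";   // 4096: no deviation allowed
  std::cout << PartitionSize(4096, 10) << "\n";  // 3687: 90% of 4096, rounded up
  std::cout << PartitionSize(1, 10) << "\n";     // 1: clamped to at least one entry
}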
+ +#include "table/persistent_cache_helper.h" +#include "table/block_based_table_reader.h" +#include "table/format.h" + +namespace rocksdb { + +void PersistentCacheHelper::InsertRawPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const char* data, const size_t size) { + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // insert content to cache + cache_options.persistent_cache->Insert(key, data, size); +} + +void PersistentCacheHelper::InsertUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + // Precondition: + // (1) content is cacheable + // (2) content is not compressed + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // insert block contents to page cache + cache_options.persistent_cache->Insert(key, contents.data.data(), + contents.data.size()); +} + +Status PersistentCacheHelper::LookupRawPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + std::unique_ptr<char[]>* raw_data, const size_t raw_data_size) { +#ifdef NDEBUG + (void)raw_data_size; +#endif + assert(cache_options.persistent_cache); + assert(cache_options.persistent_cache->IsCompressed()); + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // Lookup page + size_t size; + Status s = cache_options.persistent_cache->Lookup(key, raw_data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // cache hit + assert(raw_data_size == handle.size() + kBlockTrailerSize); + assert(size == raw_data_size); + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + return Status::OK(); +} + +Status PersistentCacheHelper::LookupUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents) { + assert(cache_options.persistent_cache); + assert(!cache_options.persistent_cache->IsCompressed()); + if (!contents) { + // We shouldn't lookup in the cache. 
Either + // (1) Nowhere to store + return Status::NotFound(); + } + + // construct the page key + char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), + cache_options.key_prefix.size(), + handle, cache_key); + // Lookup page + std::unique_ptr<char[]> data; + size_t size; + Status s = cache_options.persistent_cache->Lookup(key, &data, &size); + if (!s.ok()) { + // cache miss + RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); + return s; + } + + // please note we are potentially comparing compressed data size with + // uncompressed data size + assert(handle.size() <= size); + + // update stats + RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); + // construct result and return + *contents = BlockContents(std::move(data), size); + return Status::OK(); +} + +} // namespace rocksdb diff --git a/src/rocksdb/table/persistent_cache_helper.h b/src/rocksdb/table/persistent_cache_helper.h new file mode 100644 index 00000000..ac8ee038 --- /dev/null +++ b/src/rocksdb/table/persistent_cache_helper.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <string> + +#include "monitoring/statistics.h" +#include "table/format.h" +#include "table/persistent_cache_options.h" + +namespace rocksdb { + +struct BlockContents; + +// PersistentCacheHelper +// +// Encapsulates some of the helper logic for reading and writing from the cache +class PersistentCacheHelper { + public: + // insert block into raw page cache + static void InsertRawPage(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, const char* data, + const size_t size); + + // insert block into uncompressed cache + static void InsertUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + const BlockContents& contents); + + // lookup block from raw page cache + static Status LookupRawPage(const PersistentCacheOptions& cache_options, + const BlockHandle& handle, + std::unique_ptr<char[]>* raw_data, + const size_t raw_data_size); + + // lookup block from uncompressed cache + static Status LookupUncompressedPage( + const PersistentCacheOptions& cache_options, const BlockHandle& handle, + BlockContents* contents); +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/persistent_cache_options.h b/src/rocksdb/table/persistent_cache_options.h new file mode 100644 index 00000000..acd64036 --- /dev/null +++ b/src/rocksdb/table/persistent_cache_options.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
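(Editorial illustration, not part of this commit.) A minimal usage sketch of the uncompressed-page helpers declared above, assuming cache_options wraps a PersistentCache whose IsCompressed() returns false; the wrapper function names are invented for the example, and on a miss the caller is expected to fall back to reading the block from the SST file.

```cpp
#include <memory>

// Sketch only: store an uncompressed block, then try to read it back.
void CacheUncompressedBlock(
    const rocksdb::PersistentCacheOptions& cache_options,
    const rocksdb::BlockHandle& handle, const rocksdb::BlockContents& block) {
  rocksdb::PersistentCacheHelper::InsertUncompressedPage(cache_options, handle,
                                                         block);
}

rocksdb::Status TryReadCachedBlock(
    const rocksdb::PersistentCacheOptions& cache_options,
    const rocksdb::BlockHandle& handle, rocksdb::BlockContents* contents) {
  // A non-OK status means a cache miss (PERSISTENT_CACHE_MISS is recorded by
  // the helper); on success *contents owns the block data.
  return rocksdb::PersistentCacheHelper::LookupUncompressedPage(
      cache_options, handle, contents);
}
```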
+#pragma once + +#include <string> + +#include "monitoring/statistics.h" +#include "rocksdb/persistent_cache.h" + +namespace rocksdb { + +// PersistentCacheOptions +// +// This describes the caching behavior for the page cache +// This is used to pass the context for caching and the cache handle +struct PersistentCacheOptions { + PersistentCacheOptions() {} + explicit PersistentCacheOptions( + const std::shared_ptr<PersistentCache>& _persistent_cache, + const std::string _key_prefix, Statistics* const _statistics) + : persistent_cache(_persistent_cache), + key_prefix(_key_prefix), + statistics(_statistics) {} + + virtual ~PersistentCacheOptions() {} + + std::shared_ptr<PersistentCache> persistent_cache; + std::string key_prefix; + Statistics* statistics = nullptr; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/plain_table_builder.cc b/src/rocksdb/table/plain_table_builder.cc new file mode 100644 index 00000000..453b6c76 --- /dev/null +++ b/src/rocksdb/table/plain_table_builder.cc @@ -0,0 +1,303 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain_table_builder.h" + +#include <assert.h> + +#include <string> +#include <limits> +#include <map> + +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/plain_table_factory.h" +#include "db/dbformat.h" +#include "table/block_builder.h" +#include "table/bloom_block.h" +#include "table/plain_table_index.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +namespace { + +// a utility that helps write block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle for this particular block. +Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.table.plain | sha1sum +// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; +extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, + bool store_index_in_file) + : ioptions_(ioptions), + moptions_(moptions), + bloom_block_(num_probes), + file_(file), + bloom_bits_per_key_(bloom_bits_per_key), + huge_page_tlb_size_(huge_page_tlb_size), + encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(), + index_sparseness), + store_index_in_file_(store_index_in_file), + prefix_extractor_(moptions.prefix_extractor.get()) { + // Build index block and save it in the file if hash_table_ratio > 0 + if (store_index_in_file_) { + assert(hash_table_ratio > 0 || IsTotalOrderMode()); + index_builder_.reset(new PlainTableIndexBuilder( + &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); + properties_.user_collected_properties + [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use + } + + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chunk. + properties_.num_data_blocks = 1; + // Fill it later if store_index_in_file_ == true + properties_.index_size = 0; + properties_.filter_size = 0; + // To support roll-back to previous version, now still use version 0 for + // plain encoding. + properties_.format_version = (encoding_type == kPlain) ? 0 : 1; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->Name() + : "nullptr"; + + std::string val; + PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType())); + properties_.user_collected_properties + [PlainTablePropertyNames::kEncodingType] = val; + + for (auto& collector_factories : *int_tbl_prop_collector_factories) { + table_properties_collectors_.emplace_back( + collector_factories->CreateIntTblPropCollector(column_family_id)); + } +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + // temp buffer for metadata bytes between key and value.
+ char meta_bytes_buf[6]; + size_t meta_bytes_buf_size = 0; + + ParsedInternalKey internal_key; + if (!ParseInternalKey(key, &internal_key)) { + assert(false); + return; + } + if (internal_key.type == kTypeRangeDeletion) { + status_ = Status::NotSupported("Range deletion unsupported"); + return; + } + + // Store key hash + if (store_index_in_file_) { + if (moptions_.prefix_extractor == nullptr) { + keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); + } else { + Slice prefix = + moptions_.prefix_extractor->Transform(internal_key.user_key); + keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); + } + } + + // Write value + assert(offset_ <= std::numeric_limits<uint32_t>::max()); + auto prev_offset = static_cast<uint32_t>(offset_); + // Write out the key + encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); + if (SaveIndexInFile()) { + index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); + } + + // Write value length + uint32_t value_size = static_cast<uint32_t>(value.size()); + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + + // Write value + file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, value, offset_, table_properties_collectors_, ioptions_.info_log); +} + +Status PlainTableBuilder::status() const { return status_; } + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: bloom] - optional + // 2. [meta block: index] - optional + // 3. [meta block: properties] + // 4. [metaindex block] + // 5. 
[footer] + + MetaIndexBuilder meta_index_builer; + + if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max()); + Status s; + BlockHandle bloom_block_handle; + if (bloom_bits_per_key_ > 0) { + bloom_block_.SetTotalBits( + &arena_, + static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + + PutVarint32(&properties_.user_collected_properties + [PlainTablePropertyNames::kNumBloomBlocks], + bloom_block_.GetNumBlocks()); + + bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_); + + Slice bloom_finish_result = bloom_block_.Finish(); + + properties_.filter_size = bloom_finish_result.size(); + s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + + if (!s.ok()) { + return s; + } + meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); + } + BlockHandle index_block_handle; + Slice index_finish_result = index_builder_->Finish(); + + properties_.index_size = index_finish_result.size(); + s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + + if (!s.ok()) { + return s; + } + + meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, + index_block_handle); + } + + // Calculate bloom block size and index block size + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + property_block_builder.Add(properties_.user_collected_properties); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, + ioptions_.info_log, + &property_block_builder); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + // no need to write out new footer if we're using default checksum + Footer footer(kLegacyPlainTableMagicNumber, 0); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + return s; +} + +void PlainTableBuilder::Abandon() { + closed_ = true; +} + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_builder.h b/src/rocksdb/table/plain_table_builder.h new file mode 100644 index 00000000..ca0879a4 --- /dev/null +++ b/src/rocksdb/table/plain_table_builder.h @@ -0,0 +1,138 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
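(Editorial illustration, not part of this commit.) Finish() above writes, in order, the optional bloom and index meta blocks, the properties block, the metaindex block, and the footer. The sketch below shows the calling pattern this implies for any TableBuilder, PlainTableBuilder included: Add() internal keys in sorted order, then call exactly one of Finish() or Abandon(). The helper name and the row container are invented for the example.

```cpp
#include <string>
#include <utility>
#include <vector>

// Sketch only: rows must already be serialized internal keys in comparator
// order; Abandon() gives up without writing meta blocks or a footer.
rocksdb::Status WriteRows(
    rocksdb::TableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& sorted_rows) {
  for (const auto& kv : sorted_rows) {
    builder->Add(kv.first, kv.second);
    if (!builder->status().ok()) {
      builder->Abandon();
      return builder->status();
    }
  }
  return builder->Finish();  // emits meta blocks, metaindex, and footer
}
```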
+ +#pragma once +#ifndef ROCKSDB_LITE +#include <stdint.h> +#include <string> +#include <vector> +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/bloom_block.h" +#include "table/plain_table_index.h" +#include "table/plain_table_key_coding.h" +#include "table/table_builder.h" + +namespace rocksdb { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +class PlainTableBuilder: public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + uint32_t user_key_size, EncodingType encoding_type, + size_t index_sparseness, uint32_t bloom_bits_per_key, + const std::string& column_family_name, uint32_t num_probes = 6, + size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, + bool store_index_in_file = false); + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + bool SaveIndexInFile() const { return store_index_in_file_; } + + private: + Arena arena_; + const ImmutableCFOptions& ioptions_; + const MutableCFOptions& moptions_; + std::vector<std::unique_ptr<IntTblPropCollector>> + table_properties_collectors_; + + BloomBlockBuilder bloom_block_; + std::unique_ptr<PlainTableIndexBuilder> index_builder_; + + WritableFileWriter* file_; + uint64_t offset_ = 0; + uint32_t bloom_bits_per_key_; + size_t huge_page_tlb_size_; + Status status_; + TableProperties properties_; + PlainTableKeyEncoder encoder_; + + bool store_index_in_file_; + + std::vector<uint32_t> keys_or_prefixes_hashes_; + bool closed_ = false; // Either Finish() or Abandon() has been called. 
+ + const SliceTransform* prefix_extractor_; + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } + + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_factory.cc b/src/rocksdb/table/plain_table_factory.cc new file mode 100644 index 00000000..a6e59c14 --- /dev/null +++ b/src/rocksdb/table/plain_table_factory.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "table/plain_table_factory.h" + +#include <stdint.h> +#include <memory> +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "table/plain_table_builder.h" +#include "table/plain_table_reader.h" +#include "util/string_util.h" + +namespace rocksdb { + +Status PlainTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + return PlainTableReader::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_reader_options.internal_comparator, std::move(file), file_size, + table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, + table_options_.index_sparseness, table_options_.huge_page_tlb_size, + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + // Ignore the skip_filters flag. PlainTable format is optimized for small + // in-memory dbs. 
The skip_filters optimization is not useful for plain + // tables + // + return new PlainTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_options_.user_key_len, table_options_.encoding_type, + table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.column_family_name, 6, + table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, + table_options_.store_index_in_file); +} + +std::string PlainTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " user_key_len: %u\n", + table_options_.user_key_len); + ret.append(buffer); + snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", + table_options_.bloom_bits_per_key); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n", + table_options_.index_sparseness); + ret.append(buffer); + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n", + table_options_.huge_page_tlb_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " encoding_type: %d\n", + table_options_.encoding_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", + table_options_.full_scan_mode); + ret.append(buffer); + snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", + table_options_.store_index_in_file); + ret.append(buffer); + return ret; +} + +const PlainTableOptions& PlainTableFactory::table_options() const { + return table_options_; +} + +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetPlainTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr<MemTableRepFactory>* new_mem_factory) { + std::vector<std::string> opts_list = StringSplit(opts_str, ':'); + size_t len = opts_list.size(); + + if (opts_list.empty() || opts_list.size() > 2) { + return Status::InvalidArgument("Can't parse memtable_factory option ", + opts_str); + } + + MemTableRepFactory* mem_factory = nullptr; + + if (opts_list[0] == "skip_list") { + // Expecting format + // skip_list:<lookahead> + if (2 == len) { + size_t lookahead = ParseSizeT(opts_list[1]); + mem_factory = new SkipListFactory(lookahead); + } else if (1 == len) { + mem_factory = new SkipListFactory(); + } + } else if (opts_list[0] == "prefix_hash") { + // Expecting format + // prfix_hash:<hash_bucket_count> + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashSkipListRepFactory(hash_bucket_count); + } else if (1 == len) { + mem_factory = NewHashSkipListRepFactory(); + } + } else if (opts_list[0] == "hash_linkedlist") { + // Expecting format + // hash_linkedlist:<hash_bucket_count> + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashLinkListRepFactory(hash_bucket_count); + } else if (1 == len) { + mem_factory = NewHashLinkListRepFactory(); + } + } else if (opts_list[0] == "vector") { + // Expecting 
format + // vector:<count> + if (2 == len) { + size_t count = ParseSizeT(opts_list[1]); + mem_factory = new VectorRepFactory(count); + } else if (1 == len) { + mem_factory = new VectorRepFactory(); + } + } else if (opts_list[0] == "cuckoo") { + return Status::NotSupported( + "cuckoo hash memtable is not supported anymore."); + } else { + return Status::InvalidArgument("Unrecognized memtable_factory option ", + opts_str); + } + + if (mem_factory != nullptr) { + new_mem_factory->reset(mem_factory); + } + + return Status::OK(); +} + +std::string ParsePlainTableOptions(const std::string& name, + const std::string& org_value, + PlainTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + const auto iter = plain_table_type_info.find(name); + if (iter == plain_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} + +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped, + bool /*ignore_unknown_options*/) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParsePlainTableOptions( + o.first, o.second, new_table_options, input_strings_escaped); + if (error_message != "") { + const auto iter = plain_table_type_info.find(o.first); + if (iter == plain_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + return Status::InvalidArgument("Can't parse PlainTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { + return new PlainTableFactory(options); +} + +const std::string PlainTablePropertyNames::kEncodingType = + "rocksdb.plain.table.encoding.type"; + +const std::string PlainTablePropertyNames::kBloomVersion = + "rocksdb.plain.table.bloom.version"; + +const std::string PlainTablePropertyNames::kNumBloomBlocks = + "rocksdb.plain.table.bloom.numblocks"; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_factory.h b/src/rocksdb/table/plain_table_factory.h new file mode 100644 index 00000000..990df482 --- /dev/null +++ b/src/rocksdb/table/plain_table_factory.h @@ -0,0 +1,210 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
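(Editorial illustration, not part of this commit.) A hedged usage sketch of the two string parsers defined above. The memtable forms ("skip_list:<lookahead>", "prefix_hash:<hash_bucket_count>", "hash_linkedlist:<hash_bucket_count>", "vector:<count>") come from the code above; the "key=value;..." layout for GetPlainTableOptionsFromString is assumed from its use of StringToMap, and the concrete values are made up.

```cpp
#include <memory>
#include <string>

void ParseOptionStrings() {
  // Memtable factories accept "<name>" or "<name>:<arg>".
  std::unique_ptr<rocksdb::MemTableRepFactory> mem_factory;
  rocksdb::Status s = rocksdb::GetMemTableRepFactoryFromString(
      "prefix_hash:1000", &mem_factory);

  // PlainTableOptions can be overridden from a flat option string.
  rocksdb::PlainTableOptions base;
  rocksdb::PlainTableOptions parsed;
  s = rocksdb::GetPlainTableOptionsFromString(
      base, "user_key_len=8;bloom_bits_per_key=10;hash_table_ratio=0.75",
      &parsed);
  (void)s;  // a non-OK status names the first option that failed to parse
}
```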
+ +#pragma once + +#ifndef ROCKSDB_LITE +#include <memory> +#include <string> +#include <stdint.h> + +#include "options/options_helper.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// IndexedTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------+-----------------+ <= key1 offset +// | encoded key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +--------------------------+-------------+---+ <= key2 offset +// | encoded key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// +// When the key encoding type is kPlain. Key part is encoded as: +// +------------+--------------------+ +// | [key_size] | internal key | +// +------------+--------------------+ +// for the case of user_key_len = kPlainTableVariableLength case, +// and simply: +// +----------------------+ +// | internal key | +// +----------------------+ +// for user_key_len != kPlainTableVariableLength case. +// +// If key encoding type is kPrefix. Keys are encoding in this format. +// There are three ways to encode a key: +// (1) Full Key +// +---------------+---------------+-------------------+ +// | Full Key Flag | Full Key Size | Full Internal Key | +// +---------------+---------------+-------------------+ +// which simply encodes a full key +// +// (2) A key shared the same prefix as the previous key, which is encoded as +// format of (1). +// +-------------+-------------+-------------+-------------+------------+ +// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | +// +-------------+-------------+-------------+-------------+------------+ +// where key is the suffix part of the key, including the internal bytes. +// the actual key will be constructed by concatenating prefix part of the +// previous key, with the suffix part of the key here, with sizes given here. +// +// (3) A key shared the same prefix as the previous key, which is encoded as +// the format of (2). +// +-----------------+-----------------+------------------------+ +// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | +// +-----------------+-----------------+------------------------+ +// The key will be constructed by concatenating previous key's prefix (which is +// also a prefix which the last key encoded in the format of (1)) and the +// key given here. +// +// For example, we for following keys (prefix and suffix are separated by +// spaces): +// 0000 0001 +// 0000 00021 +// 0000 0002 +// 00011 00 +// 0002 0001 +// Will be encoded like this: +// FK 8 00000001 +// PF 4 SF 5 00021 +// SF 4 0002 +// FK 7 0001100 +// FK 8 00020001 +// (where FK means full key flag, PF means prefix flag and SF means suffix flag) +// +// All those "key flag + key size" shown above are in this format: +// The 8 bits of the first byte: +// +----+----+----+----+----+----+----+----+ +// | Type | Size | +// +----+----+----+----+----+----+----+----+ +// Type indicates: full key, prefix, or suffix. +// The last 6 bits are for size. If the size bits are not all 1, it means the +// size of the key. Otherwise, varint32 is read after this byte. 
This varint + // value + 0x3F (the value of all 1) will be the key size. + // + // For example, a full key with length 16 will be encoded as (binary): + // 00 010000 + // (00 means full key) + // and a prefix with 100 bytes will be encoded as: + // 01 111111 00100101 + // (63) (37) + // (01 means prefix) + // + // All the internal keys above (including kPlain and kPrefix) are encoded in + // this format: + // There are two types: + // (1) normal internal key format + // +----------- ...... -------------+----+---+---+---+---+---+---+---+ + // | user key |type| sequence ID | + // +----------- ..... --------------+----+---+---+---+---+---+---+---+ + // (2) Special case for keys whose sequence ID is 0 and whose type is value + // +----------- ...... -------------+----+ + // | user key |0x80| + // +----------- ..... --------------+----+ + // To save 7 bytes for the special case where sequence ID = 0. + // + // +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_len is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fixed length of this value. bloom_bits_per_key is + // number of bits used for bloom filter per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skip hash table but only relying on binary + // search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear searches + // required after hash and binary search. + // index_sparseness = 0 means index for every key. + // huge_page_tlb_size determines whether to allocate hash indexes from huge + // page TLB and the page size if allocating from there. See comments of + // Arena::AllocateAligned() for details. + explicit PlainTableFactory( + const PlainTableOptions& _table_options = PlainTableOptions()) + : table_options_(_table_options) {} + + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + std::string GetPrintableTableOptions() const override; + + const PlainTableOptions& table_options() const; + + static const char kValueTypeSeqId0 = char(~0); + + // Sanitizes the specified DB Options.
+ Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::OK(); + } + + void* GetOptions() override { return &table_options_; } + + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { + return Status::OK(); + } + + private: + PlainTableOptions table_options_; +}; + +static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, false, 0}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, false, 0}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_index.cc b/src/rocksdb/table/plain_table_index.cc new file mode 100644 index 00000000..43740923 --- /dev/null +++ b/src/rocksdb/table/plain_table_index.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
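(Editorial illustration, not part of this commit.) The plain_table_type_info map above records, per option name, the member offset and type inside PlainTableOptions. The sketch below shows how such an entry is consumed, mirroring ParsePlainTableOptions in plain_table_factory.cc; the helper name SetPlainTableField is invented, and ParseOptionHelper is called exactly as it is in that function.

```cpp
#include <string>

// Sketch only: look up an option by name and write the parsed value directly
// into the PlainTableOptions struct at the recorded member offset.
bool SetPlainTableField(rocksdb::PlainTableOptions* opts,
                        const std::string& name, const std::string& value) {
  const auto iter = rocksdb::plain_table_type_info.find(name);
  if (iter == rocksdb::plain_table_type_info.end()) {
    return false;  // unknown option name
  }
  const auto& opt_info = iter->second;
  char* field_addr = reinterpret_cast<char*>(opts) + opt_info.offset;
  return rocksdb::ParseOptionHelper(field_addr, opt_info.type, value);
}
```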
+ +#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> + +#include "table/plain_table_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + assert(num_buckets > 0); + return hash % num_buckets; +} +} + +Status PlainTableIndex::InitFromRawData(Slice data) { + if (!GetVarint32(&data, &index_size_)) { + return Status::Corruption("Couldn't read the index size!"); + } + assert(index_size_ > 0); + if (!GetVarint32(&data, &num_prefixes_)) { + return Status::Corruption("Couldn't read the index size!"); + } + sub_index_size_ = + static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen; + + char* index_data_begin = const_cast<char*>(data.data()); + index_ = reinterpret_cast<uint32_t*>(index_data_begin); + sub_index_ = reinterpret_cast<char*>(index_ + index_size_); + return Status::OK(); +} + +PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( + uint32_t prefix_hash, uint32_t* bucket_value) const { + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + GetUnaligned(index_ + bucket, bucket_value); + if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { + *bucket_value ^= kSubIndexMask; + return kSubindex; + } + if (*bucket_value >= kMaxFileSize) { + return kNoPrefixForBucket; + } else { + // point directly to the file + return kDirectToFile; + } +} + +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, + uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; +} + +void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, + uint32_t key_offset) { + if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { + ++num_prefixes_; + if (!is_first_record_) { + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + } + num_keys_per_prefix_ = 0; + prev_key_prefix_ = key_prefix_slice.ToString(); + prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); + due_index_ = true; + } + + if (due_index_) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list_.AddRecord(prev_key_prefix_hash_, key_offset); + due_index_ = false; + } + + num_keys_per_prefix_++; + if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { + due_index_ = true; + } + is_first_record_ = false; +} + +Slice PlainTableIndexBuilder::Finish() { + AllocateIndex(); + std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr); + std::vector<uint32_t> entries_per_bucket(index_size_, 0); + BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); + + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist_.ToString().c_str()); + + // From the temp data structure, populate indexes. + return FillIndexes(hash_to_offsets, entries_per_bucket); +} + +void PlainTableIndexBuilder::AllocateIndex() { + if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. 
+ index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / hash_table_ratio_; + index_size_ = + static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1; + assert(index_size_ > 0); + } +} + +void PlainTableIndexBuilder::BucketizeIndexes( + std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket) { + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list_.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list_.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + (*entries_per_bucket)[bucket]++; + } + + sub_index_size_ = 0; + for (auto entry_count : *entries_per_bucket) { + if (entry_count <= 1) { + continue; + } + // Only buckets with more than 1 entry will have subindex. + sub_index_size_ += VarintLength(entry_count); + // total bytes needed to store these entries' in-file offsets. + sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; + } +} + +Slice PlainTableIndexBuilder::FillIndexes( + const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket) { + ROCKS_LOG_DEBUG(ioptions_.info_log, + "Reserving %" PRIu32 " bytes for plain table's sub_index", + sub_index_size_); + auto total_allocate_size = GetTotalSize(); + char* allocated = arena_->AllocateAligned( + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + + auto temp_ptr = EncodeVarint32(allocated, index_size_); + uint32_t* index = + reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_)); + char* sub_index = reinterpret_cast<char*>(index + index_size_); + + uint32_t sub_index_offset = 0; + for (uint32_t i = 0; i < index_size_; i++) { + uint32_t num_keys_for_bucket = entries_per_bucket[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize); + break; + case 1: + // point directly to the file offset + PutUnaligned(index + i, hash_to_offsets[i]->offset); + break; + default: + // point to second level indexes. 
+ PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask); + char* prev_ptr = &sub_index[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr); + char* sub_index_pos = &sub_index[sub_index_offset]; + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); + } + assert(j == -1 && record == nullptr); + sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; + assert(sub_index_offset <= sub_index_size_); + break; + } + } + assert(sub_index_offset == sub_index_size_); + + ROCKS_LOG_DEBUG(ioptions_.info_log, + "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, + index_size_, sub_index_size_); + return Slice(allocated, GetTotalSize()); +} + +const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = + "PlainTableIndexBlock"; +}; // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_index.h b/src/rocksdb/table/plain_table_index.h new file mode 100644 index 00000000..360d9982 --- /dev/null +++ b/src/rocksdb/table/plain_table_index.h @@ -0,0 +1,229 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "monitoring/histogram.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "util/arena.h" +#include "util/hash.h" +#include "util/murmurhash.h" + +namespace rocksdb { + +// PlainTableIndex contains buckets size of index_size_, each is a +// 32-bit integer. The lower 31 bits contain an offset value (explained below) +// and the first bit of the integer indicates type of the offset. +// +// +--------------+------------------------------------------------------+ +// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + +// +--------------+------------------------------------------------------+ +// +// Explanation for the "flag bit": +// +// 0 indicates that the bucket contains only one prefix (no conflict when +// hashing this prefix), whose first row starts from this offset of the +// file. +// 1 indicates that the bucket contains more than one prefixes, or there +// are too many rows for one prefix so we need a binary search for it. In +// this case, the offset indicates the offset of sub_index_ holding the +// binary search indexes of keys for those rows. Those binary search indexes +// are organized in this way: +// +// The first 4 bytes, indicate how many indexes (N) are stored after it. After +// it, there are N 32-bit integers, each points of an offset of the file, +// which +// points to starting of a row. Those offsets need to be guaranteed to be in +// ascending order so the keys they are pointing to are also in ascending +// order +// to make sure we can use them to do binary searches. Below is visual +// presentation of a bucket. +// +// <begin> +// number_of_records: varint32 +// record 1 file offset: fixedint32 +// record 2 file offset: fixedint32 +// .... 
+// record N file offset: fixedint32 +// <end> +class PlainTableIndex { + public: + enum IndexSearchResult { + kNoPrefixForBucket = 0, + kDirectToFile = 1, + kSubindex = 2 + }; + + explicit PlainTableIndex(Slice data) { InitFromRawData(data); } + + PlainTableIndex() + : index_size_(0), + sub_index_size_(0), + num_prefixes_(0), + index_(nullptr), + sub_index_(nullptr) {} + + IndexSearchResult GetOffset(uint32_t prefix_hash, + uint32_t* bucket_value) const; + + Status InitFromRawData(Slice data); + + const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, + uint32_t* upper_bound) const { + const char* index_ptr = &sub_index_[offset]; + return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); + } + + uint32_t GetIndexSize() const { return index_size_; } + + uint32_t GetSubIndexSize() const { return sub_index_size_; } + + uint32_t GetNumPrefixes() const { return num_prefixes_; } + + static const uint64_t kMaxFileSize = (1u << 31) - 1; + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + + private: + uint32_t index_size_; + uint32_t sub_index_size_; + uint32_t num_prefixes_; + + uint32_t* index_; + char* sub_index_; +}; + +// PlainTableIndexBuilder is used to create plain table index. +// After calling Finish(), it returns Slice, which is usually +// used either to initialize PlainTableIndex or +// to save index to sst file. +// For more details about the index, please refer to: +// https://github.com/facebook/rocksdb/wiki/PlainTable-Format +// #wiki-in-memory-index-format +class PlainTableIndexBuilder { + public: + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + const SliceTransform* prefix_extractor, + size_t index_sparseness, double hash_table_ratio, + size_t huge_page_tlb_size) + : arena_(arena), + ioptions_(ioptions), + record_list_(kRecordsPerGroup), + is_first_record_(true), + due_index_(false), + num_prefixes_(0), + num_keys_per_prefix_(0), + prev_key_prefix_hash_(0), + index_sparseness_(index_sparseness), + index_size_(0), + sub_index_size_(0), + prefix_extractor_(prefix_extractor), + hash_table_ratio_(hash_table_ratio), + huge_page_tlb_size_(huge_page_tlb_size) {} + + void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); + + Slice Finish(); + + uint32_t GetTotalSize() const { + return VarintLength(index_size_) + VarintLength(num_prefixes_) + + PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; + } + + static const std::string kPlainTableIndexBlock; + + private: + struct IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; + }; + + // Helper class to track all the index records + class IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(uint32_t hash, uint32_t offset); + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup] + [index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + // Each group in `groups_` contains fix-sized records 
(determined by + // kNumRecordsPerGroup), which helps minimize the cost when resizing + // occurs. + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector<IndexRecord*> groups_; + size_t num_records_in_current_group_; + }; + + void AllocateIndex(); + + // Internal helper function to bucket the index record list into hash buckets. + void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket); + + // Internal helper function to fill the indexes and bloom filters to internal + // data structures. + Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket); + + Arena* arena_; + const ImmutableCFOptions ioptions_; + HistogramImpl keys_per_prefix_hist_; + IndexRecordList record_list_; + bool is_first_record_; + bool due_index_; + uint32_t num_prefixes_; + uint32_t num_keys_per_prefix_; + + uint32_t prev_key_prefix_hash_; + size_t index_sparseness_; + uint32_t index_size_; + uint32_t sub_index_size_; + + const SliceTransform* prefix_extractor_; + double hash_table_ratio_; + size_t huge_page_tlb_size_; + + std::string prev_key_prefix_; + + static const size_t kRecordsPerGroup = 256; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_key_coding.cc b/src/rocksdb/table/plain_table_key_coding.cc new file mode 100644 index 00000000..6f5ee9b4 --- /dev/null +++ b/src/rocksdb/table/plain_table_key_coding.cc @@ -0,0 +1,498 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain_table_key_coding.h" + +#include <algorithm> +#include <string> +#include "db/dbformat.h" +#include "table/plain_table_reader.h" +#include "table/plain_table_factory.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +enum PlainTableEntryType : unsigned char { + kFullKey = 0, + kPrefixFromPreviousKey = 1, + kKeySuffix = 2, +}; + +namespace { + +// Control byte: +// First two bits indicate type of entry +// The other six bits hold the inlined size. If all size bits are 1 (0x3F), +// overflow bytes are used: key_size-0x3F will be encoded as a varint32 after this byte. + +const unsigned char kSizeInlineLimit = 0x3F; + +// Return 0 for error +size_t EncodeSize(PlainTableEntryType type, uint32_t key_size, + char* out_buffer) { + out_buffer[0] = type << 6; + + if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) { + // size inlined + out_buffer[0] |= static_cast<char>(key_size); + return 1; + } else { + out_buffer[0] |= kSizeInlineLimit; + char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit); + return ptr - out_buffer; + } +} +} // namespace + +// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, + uint32_t* key_size, + uint32_t* bytes_read) { + Slice next_byte_slice; + bool success = file_reader_.Read(start_offset, 1, &next_byte_slice); + if (!success) { + return file_reader_.status(); + } + *entry_type = static_cast<PlainTableEntryType>( + (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >> + 6); + char inline_key_size = next_byte_slice[0] & kSizeInlineLimit; + if (inline_key_size < kSizeInlineLimit) { + *key_size = inline_key_size; + *bytes_read = 1; + return Status::OK(); + } else { + uint32_t extra_size; + uint32_t tmp_bytes_read; + success = file_reader_.ReadVarint32(start_offset + 1, &extra_size, + &tmp_bytes_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_bytes_read > 0); + *key_size = kSizeInlineLimit + extra_size; + *bytes_read = tmp_bytes_read + 1; + return Status::OK(); + } +} + +Status PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + return Status::Corruption(Slice()); + } + + Slice key_to_write = key; // Portion of internal key to write out. + + uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8); + if (encoding_type_ == kPlain) { + if (fixed_user_key_len_ == kPlainTableVariableLength) { + // Write key length + char key_size_buf[5]; // tmp buffer for key size as varint32 + char* ptr = EncodeVarint32(key_size_buf, user_key_size); + assert(ptr <= key_size_buf + sizeof(key_size_buf)); + auto len = ptr - key_size_buf; + Status s = file->Append(Slice(key_size_buf, len)); + if (!s.ok()) { + return s; + } + *offset += len; + } + } else { + assert(encoding_type_ == kPrefix); + char size_bytes[12]; + size_t size_bytes_pos = 0; + + Slice prefix = + prefix_extractor_->Transform(Slice(key.data(), user_key_size)); + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() || + key_count_for_prefix_ % index_sparseness_ == 0) { + key_count_for_prefix_ = 1; + pre_prefix_.SetUserKey(prefix); + size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); + Status s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!s.ok()) { + return s; + } + *offset += size_bytes_pos; + } else { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { + // For second key within a prefix, need to encode prefix length + size_bytes_pos += + EncodeSize(kPrefixFromPreviousKey, + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()), + size_bytes + size_bytes_pos); + } + uint32_t prefix_len = + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()); + size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, + size_bytes + size_bytes_pos); + Status s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!s.ok()) { + return s; + } + *offset += size_bytes_pos; + key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); + } + } + + // Encode full key + // For value size as varint32 (up to 5 bytes). + // If the row is of value type with seqId 0, flush the special flag together + // in this buffer to safe one file append call, which takes 1 byte. 
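+ // (Editorial worked example for the comment above: for an internal key + // whose user key is "abc" with sequence 0 and type kTypeValue, only the 3 + // user-key bytes are appended here while the single kValueTypeSeqId0 flag + // byte rides in meta_bytes_buf, so 4 bytes reach the file instead of the + // 11 bytes (3 user-key bytes + 8-byte internal trailer) written by the + // general case below.)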
+ if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + Status s = + file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); + if (!s.ok()) { + return s; + } + *offset += key_to_write.size() - 8; + meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; + *meta_bytes_buf_size += 1; + } else { + file->Append(key_to_write); + *offset += key_to_write.size(); + } + + return Status::OK(); +} + +Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, + uint32_t len) { + assert(file_offset + len <= file_info_->data_end_offset); + return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset), + len); +} + +bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, + Slice* out) { + const uint32_t kPrefetchSize = 256u; + + // Try to read from buffers. + for (uint32_t i = 0; i < num_buf_; i++) { + Buffer* buffer = buffers_[num_buf_ - 1 - i].get(); + if (file_offset >= buffer->buf_start_offset && + file_offset + len <= buffer->buf_start_offset + buffer->buf_len) { + *out = GetFromBuffer(buffer, file_offset, len); + return true; + } + } + + Buffer* new_buffer; + // Data needed is not in any of the buffer. Allocate a new buffer. + if (num_buf_ < buffers_.size()) { + // Add a new buffer + new_buffer = new Buffer(); + buffers_[num_buf_++].reset(new_buffer); + } else { + // Now simply replace the last buffer. Can improve the placement policy + // if needed. + new_buffer = buffers_[num_buf_ - 1].get(); + } + + assert(file_offset + len <= file_info_->data_end_offset); + uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, + std::max(kPrefetchSize, len)); + if (size_to_read > new_buffer->buf_capacity) { + new_buffer->buf.reset(new char[size_to_read]); + new_buffer->buf_capacity = size_to_read; + new_buffer->buf_len = 0; + } + Slice read_result; + Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, + new_buffer->buf.get()); + if (!s.ok()) { + status_ = s; + return false; + } + new_buffer->buf_start_offset = file_offset; + new_buffer->buf_len = size_to_read; + *out = GetFromBuffer(new_buffer, file_offset, len); + return true; +} + +inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + if (file_info_->is_mmap_mode) { + const char* start = file_info_->file_data.data() + offset; + const char* limit = + file_info_->file_data.data() + file_info_->data_end_offset; + const char* key_ptr = GetVarint32Ptr(start, limit, out); + assert(key_ptr != nullptr); + *bytes_read = static_cast<uint32_t>(key_ptr - start); + return true; + } else { + return ReadVarint32NonMmap(offset, out, bytes_read); + } +} + +bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + const char* start; + const char* limit; + const uint32_t kMaxVarInt32Size = 6u; + uint32_t bytes_to_read = + std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size); + Slice bytes; + if (!Read(offset, bytes_to_read, &bytes)) { + return false; + } + start = bytes.data(); + limit = bytes.data() + bytes.size(); + + const char* key_ptr = GetVarint32Ptr(start, limit, out); + *bytes_read = + (key_ptr != nullptr) ? 
static_cast<uint32_t>(key_ptr - start) : 0; + return true; +} + +Status PlainTableKeyDecoder::ReadInternalKey( + uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key, + uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) { + Slice tmp_slice; + bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice); + if (!success) { + return file_reader_.status(); + } + if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + parsed_key->user_key = Slice(tmp_slice.data(), user_key_size); + parsed_key->sequence = 0; + parsed_key->type = kTypeValue; + *bytes_read += user_key_size + 1; + *internal_key_valid = false; + } else { + success = file_reader_.Read(file_offset, user_key_size + 8, internal_key); + if (!success) { + return file_reader_.status(); + } + *internal_key_valid = true; + if (!ParseInternalKey(*internal_key, parsed_key)) { + return Status::Corruption( + Slice("Incorrect value type found when reading the next key")); + } + *bytes_read += user_key_size + 8; + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* /*seekable*/) { + uint32_t user_key_size = 0; + Status s; + if (fixed_user_key_len_ != kPlainTableVariableLength) { + user_key_size = fixed_user_key_len_; + } else { + uint32_t tmp_size = 0; + uint32_t tmp_read; + bool success = + file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_read > 0); + user_key_size = tmp_size; + *bytes_read = tmp_read; + } + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + cur_key_.SetInternalKey(*parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), user_key_size); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else if (internal_key != nullptr) { + if (decoded_internal_key_valid) { + *internal_key = decoded_internal_key; + } else { + // Need to copy out the internal key + cur_key_.SetInternalKey(*parsed_key); + *internal_key = cur_key_.GetInternalKey(); + } + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPrefixEncodingKey( + uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, + uint32_t* bytes_read, bool* seekable) { + PlainTableEntryType entry_type; + + bool expect_suffix = false; + Status s; + do { + uint32_t size = 0; + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + uint32_t my_bytes_read = 0; + s = DecodeSize(start_offset + *bytes_read, &entry_type, &size, + &my_bytes_read); + if (!s.ok()) { + return s; + } + if (my_bytes_read == 0) { + return Status::Corruption("Unexpected EOF when reading size of the key"); + } + *bytes_read += my_bytes_read; + + switch (entry_type) { + case kFullKey: { + expect_suffix = false; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode || + (internal_key != 
nullptr && !decoded_internal_key_valid)) { + // In non-mmap mode, always need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + cur_key_.SetInternalKey(*parsed_key); + saved_user_key_ = cur_key_.GetUserKey(); + if (!file_reader_.file_info()->is_mmap_mode) { + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), size); + } + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else { + if (internal_key != nullptr) { + *internal_key = decoded_internal_key; + } + saved_user_key_ = parsed_key->user_key; + } + break; + } + case kPrefixFromPreviousKey: { + if (seekable != nullptr) { + *seekable = false; + } + prefix_len_ = size; + assert(prefix_extractor_ == nullptr || + prefix_extractor_->Transform(saved_user_key_).size() == + prefix_len_); + // Need read another size flag for suffix + expect_suffix = true; + break; + } + case kKeySuffix: { + expect_suffix = false; + if (seekable != nullptr) { + *seekable = false; + } + + Slice tmp_slice; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &tmp_slice); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + // In non-mmap mode, we need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + // saved_user_key_ points to cur_key_. We are making a copy of + // the prefix part to another string, and construct the current + // key from the prefix part and the suffix part back to cur_key_. + std::string tmp = + Slice(saved_user_key_.data(), prefix_len_).ToString(); + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(tmp, *parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size); + saved_user_key_ = cur_key_.GetUserKey(); + } else { + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_), + *parsed_key); + } + parsed_key->user_key = cur_key_.GetUserKey(); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + break; + } + default: + return Status::Corruption("Un-identified size flag."); + } + } while (expect_suffix); // Another round if suffix is expected. 
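+  // To illustrate the loop above: assuming a prefix extractor that keeps the
+  // first three bytes, consecutive keys "apple", "apply" and "appro" would
+  // typically be stored and decoded as
+  //   kFullKey "apple"                              (kept in saved_user_key_)
+  //   kPrefixFromPreviousKey(3) + kKeySuffix "ly"   -> "app" + "ly"
+  //   kKeySuffix "ro"                               -> "app" + "ro"
+  // where prefix_len_ = 3 is remembered from the kPrefixFromPreviousKey entry
+  // and reused for the later kKeySuffix entries.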
+ return Status::OK(); +} + +Status PlainTableKeyDecoder::NextKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + uint32_t* bytes_read, bool* seekable) { + assert(value != nullptr); + Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read, + seekable); + if (s.ok()) { + assert(bytes_read != nullptr); + uint32_t value_size; + uint32_t value_size_bytes; + bool success = file_reader_.ReadVarint32(start_offset + *bytes_read, + &value_size, &value_size_bytes); + if (!success) { + return file_reader_.status(); + } + if (value_size_bytes == 0) { + return Status::Corruption( + "Unexpected EOF when reading the next value's size."); + } + *bytes_read += value_size_bytes; + success = file_reader_.Read(start_offset + *bytes_read, value_size, value); + if (!success) { + return file_reader_.status(); + } + *bytes_read += value_size; + } + return s; +} + +Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* seekable) { + *bytes_read = 0; + if (seekable != nullptr) { + *seekable = true; + } + Status s; + if (encoding_type_ == kPlain) { + return NextPlainEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } else { + assert(encoding_type_ == kPrefix); + return NextPrefixEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } +} + +} // namespace rocksdb +#endif // ROCKSDB_LIT diff --git a/src/rocksdb/table/plain_table_key_coding.h b/src/rocksdb/table/plain_table_key_coding.h new file mode 100644 index 00000000..9a27ad06 --- /dev/null +++ b/src/rocksdb/table/plain_table_key_coding.h @@ -0,0 +1,183 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include <array> +#include "rocksdb/slice.h" +#include "db/dbformat.h" +#include "table/plain_table_reader.h" + +namespace rocksdb { + +class WritableFile; +struct ParsedInternalKey; +struct PlainTableReaderFileInfo; +enum PlainTableEntryType : unsigned char; + +// Helper class to write out a key to an output file +// Actual data format of the key is documented in plain_table_factory.h +class PlainTableKeyEncoder { + public: + explicit PlainTableKeyEncoder(EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor, + size_t index_sparseness) + : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), + key_count_for_prefix_(0) {} + // key: the key to write out, in the format of internal key. + // file: the output file to write out + // offset: offset in the file. Needs to be updated after appending bytes + // for the key + // meta_bytes_buf: buffer for extra meta bytes + // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated + // if meta_bytes_buf is updated. 
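+  // Note: AppendKey only writes the key portion; the caller is expected to
+  // append the value-size varint32 and the value afterwards (see
+  // PlainTableKeyDecoder::NextKey for the matching read path), flushing
+  // meta_bytes_buf, which may hold the kValueTypeSeqId0 flag, as part of that
+  // same write.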
+  Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset,
+                   char* meta_bytes_buf, size_t* meta_bytes_buf_size);
+
+  // Return the actual encoding type that was picked
+  EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+class PlainTableFileReader {
+ public:
+  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+      : file_info_(_file_info), num_buf_(0) {}
+  // In mmapped mode, the results point to the mmapped area of the file, which
+  // means they are always valid before closing the file.
+  // In non-mmap mode, the results point to an internal buffer. If the caller
+  // makes another read call, the results may not be valid. So callers should
+  // make a copy when needed.
+  // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // covers these two common use cases:
+  // (1) the hash index identifies only one location; we read the key to
+  //     verify the location, and read the key and value if it is the right
+  //     location.
+  // (2) after hash index checking, we identify two locations (because of
+  //     hash bucket conflicts); we binary search between the two locations to
+  //     see which one is the one we need and start reading from there.
+  // These two most common use cases will be covered by the two buffers
+  // so that we don't need to re-read the same location.
+  // Currently we keep a fixed-size buffer. If a read doesn't exactly fit
+  // the buffer, we replace the second buffer with the location the user
+  // reads.
+  //
+  // If this returns false, the status code is stored in status_.
+  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+    if (file_info_->is_mmap_mode) {
+      assert(file_offset + len <= file_info_->data_end_offset);
+      *out = Slice(file_info_->file_data.data() + file_offset, len);
+      return true;
+    } else {
+      return ReadNonMmap(file_offset, len, out);
+    }
+  }
+
+  // If this returns false, the status code is stored in status_.
+  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means EOF. Returning false means failure, with the status
+  // saved in status_. Status is not returned directly, to avoid copying the
+  // status object and to match the previous performance of mmap mode.
+  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  Status status() const { return status_; }
+
+  const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+  const PlainTableReaderFileInfo* file_info_;
+
+  struct Buffer {
+    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+    std::unique_ptr<char[]> buf;
+    uint32_t buf_start_offset;
+    uint32_t buf_len;
+    uint32_t buf_capacity;
+  };
+
+  // Keep buffers for two recent reads.
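+  // buffers_[0] ends up holding the first read after the reader is created,
+  // and buffers_[1] the most recent one (that slot is the one replaced on
+  // later reads), matching the two use cases described above Read().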
+ std::array<std::unique_ptr<Buffer>, 2> buffers_; + uint32_t num_buf_; + Status status_; + + Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); +}; + +// A helper class to decode keys from input buffer +// Actual data format of the key is documented in plain_table_factory.h +class PlainTableKeyDecoder { + public: + explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, + EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor) + : file_reader_(file_info), + encoding_type_(encoding_type), + prefix_len_(0), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + in_prefix_(false) {} + // Find the next key. + // start: char array where the key starts. + // limit: boundary of the char array + // parsed_key: the output of the result key + // internal_key: if not null, fill with the output of the result key in + // un-parsed format + // bytes_read: how many bytes read from start. Output + // seekable: whether key can be read from this place. Used when building + // indexes. Output. + Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, uint32_t* bytes_read, + bool* seekable = nullptr); + + Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + + PlainTableFileReader file_reader_; + EncodingType encoding_type_; + uint32_t prefix_len_; + uint32_t fixed_user_key_len_; + Slice saved_user_key_; + IterKey cur_key_; + const SliceTransform* prefix_extractor_; + bool in_prefix_; + + private: + Status NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status NextPrefixEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, + ParsedInternalKey* parsed_key, uint32_t* bytes_read, + bool* internal_key_valid, Slice* internal_key); + inline Status DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, uint32_t* key_size, + uint32_t* bytes_read); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_reader.cc b/src/rocksdb/table/plain_table_reader.cc new file mode 100644 index 00000000..b0c6dcf0 --- /dev/null +++ b/src/rocksdb/table/plain_table_reader.cc @@ -0,0 +1,761 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef ROCKSDB_LITE + +#include "table/plain_table_reader.h" + +#include <string> +#include <vector> + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" + +#include "table/block.h" +#include "table/bloom_block.h" +#include "table/filter_block.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/two_level_iterator.h" +#include "table/plain_table_factory.h" +#include "table/plain_table_key_coding.h" +#include "table/get_context.h" + +#include "monitoring/histogram.h" +#include "monitoring/perf_context_imp.h" +#include "util/arena.h" +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/murmurhash.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace rocksdb { + +namespace { + +// Safely getting a uint32_t element from a char array, where, starting from +// `base`, every 4 bytes are considered as an fixed 32 bit integer. +inline uint32_t GetFixed32Element(const char* base, size_t offset) { + return DecodeFixed32(base + offset * sizeof(uint32_t)); +} +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public InternalIterator { + public: + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + ~PlainTableIterator() override; + + bool Valid() const override; + + void SeekToFirst() override; + + void SeekToLast() override; + + void Seek(const Slice& target) override; + + void SeekForPrev(const Slice& target) override; + + void Next() override; + + void Prev() override; + + Slice key() const override; + + Slice value() const override; + + Status status() const override; + + private: + PlainTableReader* table_; + PlainTableKeyDecoder decoder_; + bool use_prefix_seek_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader( + const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) + : internal_comparator_(icomparator), + encoding_type_(encoding_type), + full_scan_mode_(false), + user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), + prefix_extractor_(prefix_extractor), + enable_bloom_(false), + bloom_(6), + file_info_(std::move(file), storage_options, + static_cast<uint32_t>(table_properties->data_size)), + ioptions_(ioptions), + file_size_(file_size), + table_properties_(nullptr) {} + +PlainTableReader::~PlainTableReader() { +} + +Status PlainTableReader::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table, + const SliceTransform* prefix_extractor) { + if (file_size > PlainTableIndex::kMaxFileSize) { + 
return Status::NotSupported("File is too large for PlainTableReader!"); + } + + TableProperties* props = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + ioptions, &props, + true /* compression_type_missing */); + if (!s.ok()) { + return s; + } + + assert(hash_table_ratio >= 0.0); + auto& user_props = props->user_collected_properties; + auto prefix_extractor_in_file = props->prefix_extractor_name; + + if (!full_scan_mode && + !prefix_extractor_in_file.empty() /* old version sst file*/ + && prefix_extractor_in_file != "nullptr") { + if (!prefix_extractor) { + return Status::InvalidArgument( + "Prefix extractor is missing when opening a PlainTable built " + "using a prefix extractor"); + } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != + 0) { + return Status::InvalidArgument( + "Prefix extractor given doesn't match the one used to build " + "PlainTable"); + } + } + + EncodingType encoding_type = kPlain; + auto encoding_type_prop = + user_props.find(PlainTablePropertyNames::kEncodingType); + if (encoding_type_prop != user_props.end()) { + encoding_type = static_cast<EncodingType>( + DecodeFixed32(encoding_type_prop->second.c_str())); + } + + std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props, prefix_extractor)); + + s = new_reader->MmapDataIfNeeded(); + if (!s.ok()) { + return s; + } + + if (!full_scan_mode) { + s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, + index_sparseness, huge_page_tlb_size); + if (!s.ok()) { + return s; + } + } else { + // Flag to indicate it is a full scan mode so that none of the indexes + // can be used. + new_reader->full_scan_mode_ = true; + } + + if (immortal_table && new_reader->file_info_.is_mmap_mode) { + new_reader->dummy_cleanable_.reset(new Cleanable()); + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() { +} + +InternalIterator* PlainTableReader::NewIterator( + const ReadOptions& options, const SliceTransform* /* prefix_extractor */, + Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { + bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; + if (arena == nullptr) { + return new PlainTableIterator(this, use_prefix_seek); + } else { + auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); + return new (mem) PlainTableIterator(this, use_prefix_seek); + } +} + +Status PlainTableReader::PopulateIndexRecordList( + PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes) { + Slice prev_key_prefix_slice; + std::string prev_key_prefix_buf; + uint32_t pos = data_start_offset_; + + bool is_first_record = true; + Slice key_prefix_slice; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + while (pos < file_info_.data_end_offset) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + bool seekable = false; + Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); + if (!s.ok()) { + return s; + } + + key_prefix_slice = GetPrefix(key); + if (enable_bloom_) { + bloom_.AddHash(GetSliceHash(key.user_key)); + } else { + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + if (!is_first_record) { + prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); + } + if (file_info_.is_mmap_mode) { + prev_key_prefix_slice = key_prefix_slice; + } else { + 
prev_key_prefix_buf = key_prefix_slice.ToString(); + prev_key_prefix_slice = prev_key_prefix_buf; + } + } + } + + index_builder->AddKeyPrefix(GetPrefix(key), key_offset); + + if (!seekable && is_first_record) { + return Status::Corruption("Key for a prefix is not seekable"); + } + + is_first_record = false; + } + + prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); + auto s = index_.InitFromRawData(index_builder->Finish()); + return s; +} + +void PlainTableReader::AllocateAndFillBloom( + int bloom_bits_per_key, int num_prefixes, size_t huge_page_tlb_size, + std::vector<uint32_t>* prefix_hashes) { + if (!IsTotalOrderMode()) { + uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; + if (bloom_total_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); + FillBloom(prefix_hashes); + } + } +} + +void PlainTableReader::FillBloom(std::vector<uint32_t>* prefix_hashes) { + assert(bloom_.IsInitialized()); + for (auto prefix_hash : *prefix_hashes) { + bloom_.AddHash(prefix_hash); + } +} + +Status PlainTableReader::MmapDataIfNeeded() { + if (file_info_.is_mmap_mode) { + // Get mmapped memory. + return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr); + } + return Status::OK(); +} + +Status PlainTableReader::PopulateIndex(TableProperties* props, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness, + size_t huge_page_tlb_size) { + assert(props != nullptr); + table_properties_.reset(props); + + BlockContents index_block_contents; + Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents, + true /* compression_type_missing */); + + bool index_in_file = s.ok(); + + BlockContents bloom_block_contents; + bool bloom_in_file = false; + // We only need to read the bloom block if index block is in file. + if (index_in_file) { + s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + true /* compression_type_missing */); + bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; + } + + Slice* bloom_block; + if (bloom_in_file) { + // If bloom_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the bloom block. + // It needs to be kept alive to keep `bloom_block` valid. + bloom_block_alloc_ = std::move(bloom_block_contents.allocation); + bloom_block = &bloom_block_contents.data; + } else { + bloom_block = nullptr; + } + + Slice* index_block; + if (index_in_file) { + // If index_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the index block. + // It needs to be kept alive to keep `index_block` valid. + index_block_alloc_ = std::move(index_block_contents.allocation); + index_block = &index_block_contents.data; + } else { + index_block = nullptr; + } + + if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { + // moptions.prefix_extractor is requried for a hash-based look-up. 
+ return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. + + if (!index_in_file) { + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + uint32_t num_bloom_bits = + static_cast<uint32_t>(table_properties_->num_entries) * + bloom_bits_per_key; + if (num_bloom_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); + } + } + } else if (bloom_in_file) { + enable_bloom_ = true; + auto num_blocks_property = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + + uint32_t num_blocks = 0; + if (num_blocks_property != props->user_collected_properties.end()) { + Slice temp_slice(num_blocks_property->second); + if (!GetVarint32(&temp_slice, &num_blocks)) { + num_blocks = 0; + } + } + // cast away const qualifier, because bloom_ won't be changed + bloom_.SetRawData( + const_cast<unsigned char*>( + reinterpret_cast<const unsigned char*>(bloom_block->data())), + static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks); + } else { + // Index in file but no bloom in file. Disable bloom filter in this case. + enable_bloom_ = false; + bloom_bits_per_key = 0; + } + + PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, + index_sparseness, hash_table_ratio, + huge_page_tlb_size); + + std::vector<uint32_t> prefix_hashes; + if (!index_in_file) { + s = PopulateIndexRecordList(&index_builder, &prefix_hashes); + if (!s.ok()) { + return s; + } + } else { + s = index_.InitFromRawData(*index_block); + if (!s.ok()) { + return s; + } + } + + if (!index_in_file) { + // Calculated bloom filter size and allocate memory for + // bloom filter based on the number of prefixes, then fill it. + AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), + huge_page_tlb_size, &prefix_hashes); + } + + // Fill two table properties. 
+ if (!index_in_file) { + props->user_collected_properties["plain_table_hash_table_size"] = + ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); + props->user_collected_properties["plain_table_sub_index_size"] = + ToString(index_.GetSubIndexSize()); + } else { + props->user_collected_properties["plain_table_hash_table_size"] = + ToString(0); + props->user_collected_properties["plain_table_sub_index_size"] = + ToString(0); + } + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, + const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t* offset) const { + prefix_matched = false; + uint32_t prefix_index_offset; + auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); + if (res == PlainTableIndex::kNoPrefixForBucket) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } else if (res == PlainTableIndex::kDirectToFile) { + *offset = prefix_index_offset; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t upper_bound; + const char* base_ptr = + index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); + uint32_t low = 0; + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + // The key is between [low, high). Do a binary search between it. + while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = GetFixed32Element(base_ptr, mid); + uint32_t tmp; + Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + *offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + ParsedInternalKey low_key; + uint32_t tmp; + uint32_t low_key_offset = GetFixed32Element(base_ptr, low); + Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + *offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + *offset = GetFixed32Element(base_ptr, low + 1); + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. 
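+    // Setting the offset to data_end_offset signals "no candidate entry";
+    // callers such as Get() stop once the offset reaches data_end_offset.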
+ *offset = file_info_.data_end_offset; + } + return Status::OK(); +} + +bool PlainTableReader::MatchBloom(uint32_t hash) const { + if (!enable_bloom_) { + return true; + } + + if (bloom_.MayContainHash(hash)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } +} + +Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + bool* seekable) const { + if (*offset == file_info_.data_end_offset) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } + + if (*offset > file_info_.data_end_offset) { + return Status::Corruption("Offset is out of file size"); + } + + uint32_t bytes_read; + Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, + &bytes_read, seekable); + if (!s.ok()) { + return s; + } + *offset = *offset + bytes_read; + return Status::OK(); +} + +void PlainTableReader::Prepare(const Slice& target) { + if (enable_bloom_) { + uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); + bloom_.Prefetch(prefix_hash); + } +} + +Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, + GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { + // Check bloom filter first. + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + if (full_scan_mode_) { + status_ = + Status::InvalidArgument("Get() is not allowed in full scan mode."); + } + // Match whole user key for bloom filter check. + if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. + prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } + } + uint32_t offset; + bool prefix_match; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, + prefix_match, &offset); + + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + Slice found_value; + while (offset < file_info_.data_end_offset) { + s = Next(&decoder, &offset, &found_key, nullptr, &found_value); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. + if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? 
+ if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(found_key, found_value, &dont_care, + dummy_cleanable_.get())) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), + decoder_(&table_->file_info_, table_->encoding_type_, + table_->user_key_len_, table_->prefix_extractor_), + use_prefix_seek_(use_prefix_seek) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->file_info_.data_end_offset && + offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + status_ = Status::OK(); + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->file_info_.data_end_offset) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Seek(const Slice& target) { + if (use_prefix_seek_ != !table_->IsTotalOrderMode()) { + // This check is done here instead of NewIterator() to permit creating an + // iterator with total_order_seek = true even if we won't be able to Seek() + // it. This is needed for compaction: it creates iterator with + // total_order_seek = true but usually never does Seek() on it, + // only SeekToFirst(). + status_ = + Status::InvalidArgument( + "total_order_seek not implemented for PlainTable."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + // If the user doesn't set prefix seek option and we are not able to do a + // total Seek(). assert failure. + if (table_->IsTotalOrderMode()) { + if (table_->full_scan_mode_) { + status_ = + Status::InvalidArgument("Seek() is not allowed in full scan mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } else if (table_->GetIndexSize() > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order " + "mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = 0; + // Bloom filter is ignored in total-order mode. 
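+  // (When a filter exists in that mode it is built over whole user keys, as
+  // used in Get(), so it cannot answer a positional Seek(); there is only the
+  // single bucket 0 anyway.)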
+ if (!table_->IsTotalOrderMode()) { + prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MatchBloom(prefix_hash)) { + status_ = Status::OK(); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + bool prefix_match; + status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, + prefix_match, &next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + if (next_offset_ < table_->file_info_.data_end_offset) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->file_info_.data_end_offset; + } +} + +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { + assert(false); + status_ = + Status::NotSupported("SeekForPrev() is not supported in PlainTable"); + offset_ = next_offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->file_info_.data_end_offset) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = + table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + } + } +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain_table_reader.h b/src/rocksdb/table/plain_table_reader.h new file mode 100644 index 00000000..022886b7 --- /dev/null +++ b/src/rocksdb/table/plain_table_reader.h @@ -0,0 +1,236 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once + +#ifndef ROCKSDB_LITE +#include <unordered_map> +#include <memory> +#include <vector> +#include <string> +#include <stdint.h> + +#include "db/dbformat.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/table_reader.h" +#include "table/plain_table_factory.h" +#include "table/plain_table_index.h" +#include "util/arena.h" +#include "util/dynamic_bloom.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +class Block; +struct BlockContents; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class InternalKeyComparator; +class PlainTableKeyDecoder; +class GetContext; + +extern const uint32_t kPlainTableVariableLength; + +struct PlainTableReaderFileInfo { + bool is_mmap_mode; + Slice file_data; + uint32_t data_end_offset; + std::unique_ptr<RandomAccessFileReader> file; + + PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file, + const EnvOptions& storage_options, + uint32_t _data_size_offset) + : is_mmap_mode(storage_options.use_mmap_reads), + data_end_offset(_data_size_offset), + file(std::move(_file)) {} +}; + +// Based on following output file format shown in plain_table_factory.h +// When opening the output file, IndexedTableReader creates a hash table +// from key prefixes to offset of the output file. IndexedTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. +// +// The implementation of IndexedTableReader requires output file is mmaped +class PlainTableReader: public TableReader { + public: + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table = false, + const SliceTransform* prefix_extractor = nullptr); + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; + + void Prepare(const Slice& target) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override; + + uint32_t GetIndexSize() const { return index_.GetIndexSize(); } + void SetupForCompaction() override; + + std::shared_ptr<const TableProperties> GetTableProperties() const override { + return table_properties_; + } + + virtual size_t ApproximateMemoryUsage() const override { + return arena_.MemoryAllocatedBytes(); + } + + PlainTableReader(const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see 
whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // props: the table properties object that need to be stored. Ownership of + // the object will be passed. + // + + Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + size_t huge_page_tlb_size); + + Status MmapDataIfNeeded(); + + private: + const InternalKeyComparator internal_comparator_; + EncodingType encoding_type_; + // represents plain table's current status. + Status status_; + + PlainTableIndex index_; + bool full_scan_mode_; + + // data_start_offset_ and data_end_offset_ defines the range of the + // sst file that stores data. + const uint32_t data_start_offset_ = 0; + const uint32_t user_key_len_; + const SliceTransform* prefix_extractor_; + + static const size_t kNumInternalBytes = 8; + + // Bloom filter is used to rule out non-existent key + bool enable_bloom_; + DynamicBloom bloom_; + PlainTableReaderFileInfo file_info_; + Arena arena_; + CacheAllocationPtr index_block_alloc_; + CacheAllocationPtr bloom_block_alloc_; + + const ImmutableCFOptions& ioptions_; + std::unique_ptr<Cleanable> dummy_cleanable_; + uint64_t file_size_; + std::shared_ptr<const TableProperties> table_properties_; + + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. + Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes); + + // Internal helper function to allocate memory for bloom filter and fill it + void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes, + size_t huge_page_tlb_size, + std::vector<uint32_t>* prefix_hashes); + + void FillBloom(std::vector<uint32_t>* prefix_hashes); + + // Read the key and value at `offset` to parameters for keys, the and + // `seekable`. + // On success, `offset` will be updated as the offset for the next key. + // `parsed_key` will be key in parsed format. + // if `internal_key` is not empty, it will be filled with key with slice + // format. + // if `seekable` is not null, it will return whether we can directly read + // data using this offset. 
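+  // In other words, on success `parsed_key`, `value` and, when non-null,
+  // `internal_key` describe the entry found at `*offset`, and `*offset` is
+  // advanced past that entry. (If `*offset` already equals data_end_offset,
+  // Next() is a no-op.)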
+ Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, + bool* seekable = nullptr) const; + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. + Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, + const Slice& prefix, uint32_t prefix_hash, + bool& prefix_matched, uint32_t* offset) const; + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/scoped_arena_iterator.h b/src/rocksdb/table/scoped_arena_iterator.h new file mode 100644 index 00000000..1de570dc --- /dev/null +++ b/src/rocksdb/table/scoped_arena_iterator.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/internal_iterator.h" +#include "port/port.h" + +namespace rocksdb { +class ScopedArenaIterator { + + void reset(InternalIterator* iter) ROCKSDB_NOEXCEPT { + if (iter_ != nullptr) { + iter_->~InternalIterator(); + } + iter_ = iter; + } + + public: + + explicit ScopedArenaIterator(InternalIterator* iter = nullptr) + : iter_(iter) {} + + ScopedArenaIterator(const ScopedArenaIterator&) = delete; + ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete; + + ScopedArenaIterator(ScopedArenaIterator&& o) ROCKSDB_NOEXCEPT { + iter_ = o.iter_; + o.iter_ = nullptr; + } + + ScopedArenaIterator& operator=(ScopedArenaIterator&& o) ROCKSDB_NOEXCEPT { + reset(o.iter_); + o.iter_ = nullptr; + return *this; + } + + InternalIterator* operator->() { return iter_; } + InternalIterator* get() { return iter_; } + + void set(InternalIterator* iter) { reset(iter); } + + InternalIterator* release() { + assert(iter_ != nullptr); + auto* res = iter_; + iter_ = nullptr; + return res; + } + + ~ScopedArenaIterator() { + reset(nullptr); + } + + private: + InternalIterator* iter_; +}; +} // namespace rocksdb diff --git a/src/rocksdb/table/sst_file_reader.cc b/src/rocksdb/table/sst_file_reader.cc new file mode 100644 index 00000000..54408bb5 --- /dev/null +++ b/src/rocksdb/table/sst_file_reader.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
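+
+// SstFileReader opens a standalone .sst file outside of any DB and exposes an
+// iterator over its entries, its table properties, and checksum verification
+// (see the corresponding methods below).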
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_file_reader.h" + +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "options/cf_options.h" +#include "table/get_context.h" +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +struct SstFileReader::Rep { + Options options; + EnvOptions soptions; + ImmutableCFOptions ioptions; + MutableCFOptions moptions; + + std::unique_ptr<TableReader> table_reader; + + Rep(const Options& opts) + : options(opts), + soptions(options), + ioptions(options), + moptions(ColumnFamilyOptions(options)) {} +}; + +SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {} + +SstFileReader::~SstFileReader() {} + +Status SstFileReader::Open(const std::string& file_path) { + auto r = rep_.get(); + Status s; + uint64_t file_size = 0; + std::unique_ptr<RandomAccessFile> file; + std::unique_ptr<RandomAccessFileReader> file_reader; + s = r->options.env->GetFileSize(file_path, &file_size); + if (s.ok()) { + s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + } + if (s.ok()) { + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + if (s.ok()) { + TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), + r->soptions, r->ioptions.internal_comparator); + // Allow open file with global sequence number for backward compatibility. + t_opt.largest_seqno = kMaxSequenceNumber; + s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &r->table_reader); + } + return s; +} + +Iterator* SstFileReader::NewIterator(const ReadOptions& options) { + auto r = rep_.get(); + auto sequence = options.snapshot != nullptr + ? options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + auto internal_iter = + r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get()); + return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, + r->ioptions.user_comparator, internal_iter, sequence, + r->moptions.max_sequential_skip_in_iterations, + nullptr /* read_callback */); +} + +std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties() + const { + return rep_->table_reader->GetTableProperties(); +} + +Status SstFileReader::VerifyChecksum() { + return rep_->table_reader->VerifyChecksum(); +} + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_reader_test.cc b/src/rocksdb/table/sst_file_reader_test.cc new file mode 100644 index 00000000..51bc975a --- /dev/null +++ b/src/rocksdb/table/sst_file_reader_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include <inttypes.h> + +#include "rocksdb/db.h" +#include "rocksdb/sst_file_reader.h" +#include "rocksdb/sst_file_writer.h" +#include "table/sst_file_writer_collectors.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +std::string EncodeAsString(uint64_t v) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08" PRIu64, v); + return std::string(buf); +} + +std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; +} + +class SstFileReaderTest : public testing::Test { + public: + SstFileReaderTest() { + options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); + sst_name_ = test::PerThreadDBPath("sst_file"); + } + + ~SstFileReaderTest() { + Status s = Env::Default()->DeleteFile(sst_name_); + assert(s.ok()); + } + + void CreateFile(const std::string& file_name, + const std::vector<std::string>& keys) { + SstFileWriter writer(soptions_, options_); + ASSERT_OK(writer.Open(file_name)); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_OK(writer.Put(keys[i], keys[i])); + ASSERT_OK(writer.Merge(keys[i + 1], EncodeAsUint64(i + 1))); + ASSERT_OK(writer.Delete(keys[i + 2])); + } + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& file_name, + const std::vector<std::string>& keys, + bool check_global_seqno = false) { + ReadOptions ropts; + SstFileReader reader(options_); + ASSERT_OK(reader.Open(file_name)); + ASSERT_OK(reader.VerifyChecksum()); + std::unique_ptr<Iterator> iter(reader.NewIterator(ropts)); + iter->SeekToFirst(); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i]), 0); + ASSERT_EQ(iter->value().compare(keys[i]), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i + 1]), 0); + ASSERT_EQ(iter->value().compare(EncodeAsUint64(i + 1)), 0); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + if (check_global_seqno) { + auto properties = reader.GetTableProperties(); + ASSERT_TRUE(properties); + auto& user_properties = properties->user_collected_properties; + ASSERT_TRUE( + user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); + } + } + + void CreateFileAndCheck(const std::vector<std::string>& keys) { + CreateFile(sst_name_, keys); + CheckFile(sst_name_, keys); + } + + protected: + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +const uint64_t kNumKeys = 100; + +TEST_F(SstFileReaderTest, Basic) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, Uint64Comparator) { + options_.comparator = test::Uint64Comparator(); + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + // Generate a SST file. + CreateFile(sst_name_, keys); + + // Ingest the file into a db, to assign it a global sequence number. + Options options; + options.create_if_missing = true; + std::string db_name = test::PerThreadDBPath("test_db"); + DB* db; + ASSERT_OK(DB::Open(options, db_name, &db)); + // Bump sequence number. 
+ ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo")); + ASSERT_OK(db->Flush(FlushOptions())); + // Ingest the file. + IngestExternalFileOptions ingest_options; + ingest_options.write_global_seqno = true; + ASSERT_OK(db->IngestExternalFile({sst_name_}, ingest_options)); + std::vector<std::string> live_files; + uint64_t manifest_file_size = 0; + ASSERT_OK(db->GetLiveFiles(live_files, &manifest_file_size)); + // Get the ingested file. + std::string ingested_file; + for (auto& live_file : live_files) { + if (live_file.substr(live_file.size() - 4, std::string::npos) == ".sst") { + if (ingested_file.empty() || ingested_file < live_file) { + ingested_file = live_file; + } + } + } + ASSERT_FALSE(ingested_file.empty()); + delete db; + + // Verify the file can be open and read by SstFileReader. + CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */); + + // Cleanup. + ASSERT_OK(DestroyDB(db_name, options)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/sst_file_writer.cc b/src/rocksdb/table/sst_file_writer.cc new file mode 100644 index 00000000..b9a7273e --- /dev/null +++ b/src/rocksdb/table/sst_file_writer.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/sst_file_writer.h" + +#include <vector> +#include "db/dbformat.h" +#include "rocksdb/table.h" +#include "table/block_based_table_builder.h" +#include "table/sst_file_writer_collectors.h" +#include "util/file_reader_writer.h" +#include "util/sync_point.h" + +namespace rocksdb { + +const std::string ExternalSstFilePropertyNames::kVersion = + "rocksdb.external_sst_file.version"; +const std::string ExternalSstFilePropertyNames::kGlobalSeqno = + "rocksdb.external_sst_file.global_seqno"; + +#ifndef ROCKSDB_LITE + +const size_t kFadviseTrigger = 1024 * 1024; // 1MB + +struct SstFileWriter::Rep { + Rep(const EnvOptions& _env_options, const Options& options, + Env::IOPriority _io_priority, const Comparator* _user_comparator, + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters) + : env_options(_env_options), + ioptions(options), + mutable_cf_options(options), + io_priority(_io_priority), + internal_comparator(_user_comparator), + cfh(_cfh), + invalidate_page_cache(_invalidate_page_cache), + last_fadvise_size(0), + skip_filters(_skip_filters) {} + + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<TableBuilder> builder; + EnvOptions env_options; + ImmutableCFOptions ioptions; + MutableCFOptions mutable_cf_options; + Env::IOPriority io_priority; + InternalKeyComparator internal_comparator; + ExternalSstFileInfo file_info; + InternalKey ikey; + std::string column_family_name; + ColumnFamilyHandle* cfh; + // If true, We will give the OS a hint that this file pages is not needed + // every time we write 1MB to the file. + bool invalidate_page_cache; + // The size of the file during the last time we called Fadvise to remove + // cached pages from page cache. 
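+  // The hint is re-issued roughly once per kFadviseTrigger (1MB) of newly
+  // written data; see InvalidatePageCache() below.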
+ uint64_t last_fadvise_size; + bool skip_filters; + Status Add(const Slice& user_key, const Slice& value, + const ValueType value_type) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + + if (file_info.num_entries == 0) { + file_info.smallest_key.assign(user_key.data(), user_key.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + user_key, file_info.largest_key) <= 0) { + // Make sure that keys are added in order + return Status::InvalidArgument("Keys must be added in order"); + } + } + + // TODO(tec) : For external SST files we could omit the seqno and type. + switch (value_type) { + case ValueType::kTypeValue: + ikey.Set(user_key, 0 /* Sequence Number */, + ValueType::kTypeValue /* Put */); + break; + case ValueType::kTypeMerge: + ikey.Set(user_key, 0 /* Sequence Number */, + ValueType::kTypeMerge /* Merge */); + break; + case ValueType::kTypeDeletion: + ikey.Set(user_key, 0 /* Sequence Number */, + ValueType::kTypeDeletion /* Delete */); + break; + default: + return Status::InvalidArgument("Value type is not supported"); + } + builder->Add(ikey.Encode(), value); + + // update file info + file_info.num_entries++; + file_info.largest_key.assign(user_key.data(), user_key.size()); + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */); + + return Status::OK(); + } + + Status DeleteRange(const Slice& begin_key, const Slice& end_key) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + + RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */); + if (file_info.num_range_del_entries == 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + tombstone.start_key_, file_info.smallest_range_del_key) < 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + } + if (internal_comparator.user_comparator()->Compare( + tombstone.end_key_, file_info.largest_range_del_key) > 0) { + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } + } + + auto ikey_and_end_key = tombstone.Serialize(); + builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); + + // update file info + file_info.num_range_del_entries++; + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */); + + return Status::OK(); + } + + void InvalidatePageCache(bool closing) { + if (invalidate_page_cache == false) { + // Fadvise disabled + return; + } + uint64_t bytes_since_last_fadvise = + builder->FileSize() - last_fadvise_size; + if (bytes_since_last_fadvise > kFadviseTrigger || closing) { + TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache", + &(bytes_since_last_fadvise)); + // Tell the OS that we dont need this file in page cache + file_writer->InvalidateCache(0, 0); + last_fadvise_size = builder->FileSize(); + } + } + +}; + +SstFileWriter::SstFileWriter(const EnvOptions& env_options, + const Options& options, + const Comparator* user_comparator, + ColumnFamilyHandle* column_family, + bool invalidate_page_cache, + Env::IOPriority io_priority, bool skip_filters) + : rep_(new Rep(env_options, options, io_priority, user_comparator, + column_family, invalidate_page_cache, skip_filters)) { + rep_->file_info.file_size = 0; +} + 
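The constructor above only wires the user-supplied options into Rep; the actual write path is Rep::Add, which rejects out-of-order keys and stamps every entry with sequence number 0, with the public Put()/Merge()/Delete() calls further down funneling into it and Finish() sealing the file. A minimal usage sketch of that flow, in the spirit of sst_file_reader_test.cc; the helper name, file path and keys below are made up for illustration and are not part of this change:

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

// Sketch only: "/tmp/example.sst" and the keys are illustrative placeholders.
rocksdb::Status WriteExampleSst() {
  rocksdb::Options options;
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  rocksdb::Status s = writer.Open("/tmp/example.sst");
  if (!s.ok()) {
    return s;
  }
  // Keys must be added in increasing comparator order, or Put() returns
  // InvalidArgument("Keys must be added in order").
  s = writer.Put("k1", "v1");
  if (s.ok()) {
    s = writer.Put("k2", "v2");
  }
  if (!s.ok()) {
    return s;
  }
  // Finish() requires at least one entry; it syncs and closes the file and
  // fills in the optional ExternalSstFileInfo.
  rocksdb::ExternalSstFileInfo file_info;
  return writer.Finish(&file_info);
}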
+SstFileWriter::~SstFileWriter() { + if (rep_->builder) { + // User did not call Finish() or Finish() failed, we need to + // abandon the builder. + rep_->builder->Abandon(); + } +} + +Status SstFileWriter::Open(const std::string& file_path) { + Rep* r = rep_.get(); + Status s; + std::unique_ptr<WritableFile> sst_file; + s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options); + if (!s.ok()) { + return s; + } + + sst_file->SetIOPriority(r->io_priority); + + CompressionType compression_type; + CompressionOptions compression_opts; + if (r->ioptions.bottommost_compression != kDisableCompressionOption) { + compression_type = r->ioptions.bottommost_compression; + if (r->ioptions.bottommost_compression_opts.enabled) { + compression_opts = r->ioptions.bottommost_compression_opts; + } else { + compression_opts = r->ioptions.compression_opts; + } + } else if (!r->ioptions.compression_per_level.empty()) { + // Use the compression of the last level if we have per level compression + compression_type = *(r->ioptions.compression_per_level.rbegin()); + compression_opts = r->ioptions.compression_opts; + } else { + compression_type = r->mutable_cf_options.compression; + compression_opts = r->ioptions.compression_opts; + } + uint64_t sample_for_compression = + r->mutable_cf_options.sample_for_compression; + + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + + // SstFileWriter properties collector to add SstFileWriter version. + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + + // User collector factories + auto user_collector_factories = + r->ioptions.table_properties_collector_factories; + for (size_t i = 0; i < user_collector_factories.size(); i++) { + int_tbl_prop_collector_factories.emplace_back( + new UserKeyTablePropertiesCollectorFactory( + user_collector_factories[i])); + } + int unknown_level = -1; + uint32_t cf_id; + + if (r->cfh != nullptr) { + // user explicitly specified that this file will be ingested into cfh, + // we can persist this information in the file. + cf_id = r->cfh->GetID(); + r->column_family_name = r->cfh->GetName(); + } else { + r->column_family_name = ""; + cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } + + TableBuilderOptions table_builder_options( + r->ioptions, r->mutable_cf_options, r->internal_comparator, + &int_tbl_prop_collector_factories, compression_type, + sample_for_compression, compression_opts, r->skip_filters, + r->column_family_name, unknown_level); + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.env, + nullptr /* stats */, r->ioptions.listeners)); + + // TODO(tec) : If table_factory is using compressed block cache, we will + // be adding the external sst file blocks into it, which is wasteful. 
+ r->builder.reset(r->ioptions.table_factory->NewTableBuilder( + table_builder_options, cf_id, r->file_writer.get())); + + r->file_info = ExternalSstFileInfo(); + r->file_info.file_path = file_path; + r->file_info.version = 2; + return s; +} + +Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeValue); +} + +Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { + return rep_->Add(user_key, value, ValueType::kTypeMerge); +} + +Status SstFileWriter::Delete(const Slice& user_key) { + return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); +} + +Status SstFileWriter::DeleteRange(const Slice& begin_key, + const Slice& end_key) { + return rep_->DeleteRange(begin_key, end_key); +} + +Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { + Rep* r = rep_.get(); + if (!r->builder) { + return Status::InvalidArgument("File is not opened"); + } + if (r->file_info.num_entries == 0 && + r->file_info.num_range_del_entries == 0) { + return Status::InvalidArgument("Cannot create sst file with no entries"); + } + + Status s = r->builder->Finish(); + r->file_info.file_size = r->builder->FileSize(); + + if (s.ok()) { + s = r->file_writer->Sync(r->ioptions.use_fsync); + r->InvalidatePageCache(true /* closing */); + if (s.ok()) { + s = r->file_writer->Close(); + } + } + if (!s.ok()) { + r->ioptions.env->DeleteFile(r->file_info.file_path); + } + + if (file_info != nullptr) { + *file_info = r->file_info; + } + + r->builder.reset(); + return s; +} + +uint64_t SstFileWriter::FileSize() { + return rep_->file_info.file_size; +} +#endif // !ROCKSDB_LITE + +} // namespace rocksdb diff --git a/src/rocksdb/table/sst_file_writer_collectors.h b/src/rocksdb/table/sst_file_writer_collectors.h new file mode 100644 index 00000000..e1827939 --- /dev/null +++ b/src/rocksdb/table/sst_file_writer_collectors.h @@ -0,0 +1,94 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <string> +#include "db/dbformat.h" +#include "db/table_properties_collector.h" +#include "rocksdb/types.h" +#include "util/string_util.h" + +namespace rocksdb { + +// Table Properties that are specific to tables created by SstFileWriter. +struct ExternalSstFilePropertyNames { + // value of this property is a fixed uint32 number. + static const std::string kVersion; + // value of this property is a fixed uint64 number. + static const std::string kGlobalSeqno; +}; + +// PropertiesCollector used to add properties specific to tables +// generated by SstFileWriter +class SstFileWriterPropertiesCollector : public IntTblPropCollector { + public: + explicit SstFileWriterPropertiesCollector(int32_t version, + SequenceNumber global_seqno) + : version_(version), global_seqno_(global_seqno) {} + + virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { + // Intentionally left blank. Have no interest in collecting stats for + // individual key/value pairs. 
+ return Status::OK(); + } + + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + virtual Status Finish(UserCollectedProperties* properties) override { + // File version + std::string version_val; + PutFixed32(&version_val, static_cast<uint32_t>(version_)); + properties->insert({ExternalSstFilePropertyNames::kVersion, version_val}); + + // Global Sequence number + std::string seqno_val; + PutFixed64(&seqno_val, static_cast<uint64_t>(global_seqno_)); + properties->insert({ExternalSstFilePropertyNames::kGlobalSeqno, seqno_val}); + + return Status::OK(); + } + + virtual const char* Name() const override { + return "SstFileWriterPropertiesCollector"; + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return {{ExternalSstFilePropertyNames::kVersion, ToString(version_)}}; + } + + private: + int32_t version_; + SequenceNumber global_seqno_; +}; + +class SstFileWriterPropertiesCollectorFactory + : public IntTblPropCollectorFactory { + public: + explicit SstFileWriterPropertiesCollectorFactory(int32_t version, + SequenceNumber global_seqno) + : version_(version), global_seqno_(global_seqno) {} + + virtual IntTblPropCollector* CreateIntTblPropCollector( + uint32_t /*column_family_id*/) override { + return new SstFileWriterPropertiesCollector(version_, global_seqno_); + } + + virtual const char* Name() const override { + return "SstFileWriterPropertiesCollector"; + } + + private: + int32_t version_; + SequenceNumber global_seqno_; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h new file mode 100644 index 00000000..20d9a55f --- /dev/null +++ b/src/rocksdb/table/table_builder.h @@ -0,0 +1,155 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
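The SstFileWriterPropertiesCollector above publishes exactly two entries into a file's user-collected properties, keyed by ExternalSstFilePropertyNames::kVersion (a fixed32 encoding) and kGlobalSeqno (a fixed64 encoding). A short sketch of checking for them via SstFileReader, assuming a file that was previously produced by SstFileWriter; the helper name and path are illustrative only:

#include <cstdio>
#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"
#include "table/sst_file_writer_collectors.h"

// Sketch only: assumes "/tmp/example.sst" was written by SstFileWriter.
void InspectExternalSstProperties() {
  rocksdb::Options options;
  rocksdb::SstFileReader reader(options);
  if (!reader.Open("/tmp/example.sst").ok()) {
    return;
  }
  auto props = reader.GetTableProperties();
  const auto& user_props = props->user_collected_properties;
  // The stored values are raw PutFixed32/PutFixed64 encodings, not text.
  std::printf("has version: %zu, has global seqno: %zu\n",
              user_props.count(rocksdb::ExternalSstFilePropertyNames::kVersion),
              user_props.count(
                  rocksdb::ExternalSstFilePropertyNames::kGlobalSeqno));
}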
+ +#pragma once + +#include <stdint.h> +#include <string> +#include <utility> +#include <vector> +#include "db/dbformat.h" +#include "db/table_properties_collector.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +class Slice; +class Status; + +struct TableReaderOptions { + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters = false, bool _immortal = false, + int _level = -1) + : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, + _internal_comparator, _skip_filters, _immortal, + _level, 0 /* _largest_seqno */) {} + + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters, bool _immortal, int _level, + SequenceNumber _largest_seqno) + : ioptions(_ioptions), + prefix_extractor(_prefix_extractor), + env_options(_env_options), + internal_comparator(_internal_comparator), + skip_filters(_skip_filters), + immortal(_immortal), + level(_level), + largest_seqno(_largest_seqno) {} + + const ImmutableCFOptions& ioptions; + const SliceTransform* prefix_extractor; + const EnvOptions& env_options; + const InternalKeyComparator& internal_comparator; + // This is only used for BlockBasedTable (reader) + bool skip_filters; + // Whether the table will be valid as long as the DB is open + bool immortal; + // what level this table/file is on, -1 for "not set, don't know" + int level; + // largest seqno in the table + SequenceNumber largest_seqno; +}; + +struct TableBuilderOptions { + TableBuilderOptions( + const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const InternalKeyComparator& _internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + _int_tbl_prop_collector_factories, + CompressionType _compression_type, uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, bool _skip_filters, + const std::string& _column_family_name, int _level, + const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, + const uint64_t _target_file_size = 0) + : ioptions(_ioptions), + moptions(_moptions), + internal_comparator(_internal_comparator), + int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), + compression_type(_compression_type), + sample_for_compression(_sample_for_compression), + compression_opts(_compression_opts), + skip_filters(_skip_filters), + column_family_name(_column_family_name), + level(_level), + creation_time(_creation_time), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size) {} + const ImmutableCFOptions& ioptions; + const MutableCFOptions& moptions; + const InternalKeyComparator& internal_comparator; + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories; + CompressionType compression_type; + uint64_t sample_for_compression; + const CompressionOptions& compression_opts; + bool skip_filters; // only used by BlockBasedTableBuilder + const std::string& column_family_name; + int level; // what level this table/file is on, -1 for "not set, don't know" + const 
uint64_t creation_time; + const int64_t oldest_key_time; + const uint64_t target_file_size; +}; + +// TableBuilder provides the interface used to build a Table +// (an immutable and sorted map from keys to values). +// +// Multiple threads can invoke const methods on a TableBuilder without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same TableBuilder must use +// external synchronization. +class TableBuilder { + public: + // REQUIRES: Either Finish() or Abandon() has been called. + virtual ~TableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Add(const Slice& key, const Slice& value) = 0; + + // Return non-ok iff some error has been detected. + virtual Status status() const = 0; + + // Finish building the table. + // REQUIRES: Finish(), Abandon() have not been called + virtual Status Finish() = 0; + + // Indicate that the contents of this builder should be abandoned. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + virtual void Abandon() = 0; + + // Number of calls to Add() so far. + virtual uint64_t NumEntries() const = 0; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + virtual uint64_t FileSize() const = 0; + + // If the user defined table properties collector suggest the file to + // be further compacted. + virtual bool NeedCompact() const { return false; } + + // Returns table properties + virtual TableProperties GetTableProperties() const = 0; +}; + +} // namespace rocksdb diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc new file mode 100644 index 00000000..b7aaea48 --- /dev/null +++ b/src/rocksdb/table/table_properties.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/table_properties.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "table/block.h" +#include "table/internal_iterator.h" +#include "table/table_properties_internal.h" +#include "util/string_util.h" + +namespace rocksdb { + +const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily = + port::kMaxInt32; + +namespace { + void AppendProperty( + std::string& props, + const std::string& key, + const std::string& value, + const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); + } + + template <class TValue> + void AppendProperty( + std::string& props, + const std::string& key, + const TValue& value, + const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty( + props, key, ToString(value), prop_delim, kv_delim + ); + } + + // Seek to the specified meta block. + // Return true if it successfully seeks to that block. 
+ Status SeekToMetaBlock(InternalIterator* meta_iter, + const std::string& block_name, bool* is_found, + BlockHandle* block_handle = nullptr) { + if (block_handle != nullptr) { + *block_handle = BlockHandle::NullBlockHandle(); + } + *is_found = true; + meta_iter->Seek(block_name); + if (meta_iter->status().ok()) { + if (meta_iter->Valid() && meta_iter->key() == block_name) { + *is_found = true; + if (block_handle) { + Slice v = meta_iter->value(); + return block_handle->DecodeFrom(&v); + } + } else { + *is_found = false; + return Status::OK(); + } + } + return meta_iter->status(); + } +} + +std::string TableProperties::ToString( + const std::string& prop_delim, + const std::string& kv_delim) const { + std::string result; + result.reserve(1024); + + // Basic Info + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); + AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim); + AppendProperty(result, "# merge operands", num_merge_operands, prop_delim, + kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); + + AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); + + AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); + char index_block_size_str[80]; + snprintf(index_block_size_str, sizeof(index_block_size_str), + "index block size (user-key? %d, delta-value? %d)", + static_cast<int>(index_key_is_user_key), + static_cast<int>(index_value_is_delta_encoded)); + AppendProperty(result, index_block_size_str, index_size, prop_delim, + kv_delim); + if (index_partitions != 0) { + AppendProperty(result, "# index partitions", index_partitions, prop_delim, + kv_delim); + AppendProperty(result, "top-level index size", top_level_index_size, prop_delim, + kv_delim); + } + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); + + AppendProperty( + result, "filter policy name", + filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, + prop_delim, kv_delim); + + AppendProperty(result, "prefix extractor name", + prefix_extractor_name.empty() ? std::string("N/A") + : prefix_extractor_name, + prop_delim, kv_delim); + + AppendProperty(result, "column family ID", + column_family_id == rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily + ? std::string("N/A") + : rocksdb::ToString(column_family_id), + prop_delim, kv_delim); + AppendProperty( + result, "column family name", + column_family_name.empty() ? std::string("N/A") : column_family_name, + prop_delim, kv_delim); + + AppendProperty(result, "comparator name", + comparator_name.empty() ? std::string("N/A") : comparator_name, + prop_delim, kv_delim); + + AppendProperty( + result, "merge operator name", + merge_operator_name.empty() ? std::string("N/A") : merge_operator_name, + prop_delim, kv_delim); + + AppendProperty(result, "property collectors names", + property_collectors_names.empty() ? 
std::string("N/A") + : property_collectors_names, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression algo", + compression_name.empty() ? std::string("N/A") : compression_name, + prop_delim, kv_delim); + + AppendProperty( + result, "SST file compression options", + compression_options.empty() ? std::string("N/A") : compression_options, + prop_delim, kv_delim); + + AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim); + + AppendProperty(result, "time stamp of earliest key", oldest_key_time, + prop_delim, kv_delim); + + return result; +} + +void TableProperties::Add(const TableProperties& tp) { + data_size += tp.data_size; + index_size += tp.index_size; + index_partitions += tp.index_partitions; + top_level_index_size += tp.top_level_index_size; + index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; + filter_size += tp.filter_size; + raw_key_size += tp.raw_key_size; + raw_value_size += tp.raw_value_size; + num_data_blocks += tp.num_data_blocks; + num_entries += tp.num_entries; + num_deletions += tp.num_deletions; + num_merge_operands += tp.num_merge_operands; + num_range_deletions += tp.num_range_deletions; +} + +const std::string TablePropertiesNames::kDataSize = + "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = + "rocksdb.index.size"; +const std::string TablePropertiesNames::kIndexPartitions = + "rocksdb.index.partitions"; +const std::string TablePropertiesNames::kTopLevelIndexSize = + "rocksdb.top-level.index.size"; +const std::string TablePropertiesNames::kIndexKeyIsUserKey = + "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; +const std::string TablePropertiesNames::kFilterSize = + "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = + "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kRawValueSize = + "rocksdb.raw.value.size"; +const std::string TablePropertiesNames::kNumDataBlocks = + "rocksdb.num.data.blocks"; +const std::string TablePropertiesNames::kNumEntries = + "rocksdb.num.entries"; +const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string TablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; +const std::string TablePropertiesNames::kFilterPolicy = + "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFormatVersion = + "rocksdb.format.version"; +const std::string TablePropertiesNames::kFixedKeyLen = + "rocksdb.fixed.key.length"; +const std::string TablePropertiesNames::kColumnFamilyId = + "rocksdb.column.family.id"; +const std::string TablePropertiesNames::kColumnFamilyName = + "rocksdb.column.family.name"; +const std::string TablePropertiesNames::kComparator = "rocksdb.comparator"; +const std::string TablePropertiesNames::kMergeOperator = + "rocksdb.merge.operator"; +const std::string TablePropertiesNames::kPrefixExtractorName = + "rocksdb.prefix.extractor.name"; +const std::string TablePropertiesNames::kPropertyCollectors = + "rocksdb.property.collectors"; +const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; +const std::string TablePropertiesNames::kCompressionOptions = + "rocksdb.compression_options"; +const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time"; +const std::string 
TablePropertiesNames::kOldestKeyTime =
+    "rocksdb.oldest.key.time";
+
+extern const std::string kPropertiesBlock = "rocksdb.properties";
+// Old property block name for backward compatibility
+extern const std::string kPropertiesBlockOldName = "rocksdb.stats";
+extern const std::string kCompressionDictBlock = "rocksdb.compression_dict";
+extern const std::string kRangeDelBlock = "rocksdb.range_del";
+
+// Seek to the properties block.
+// Sets *is_found to true if it successfully seeks to the properties block.
+Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) {
+  Status status = SeekToMetaBlock(meta_iter, kPropertiesBlock, is_found);
+  if (!*is_found && status.ok()) {
+    status = SeekToMetaBlock(meta_iter, kPropertiesBlockOldName, is_found);
+  }
+  return status;
+}
+
+// Seek to the compression dictionary block.
+// Sets *is_found to true if it successfully seeks to that block.
+Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found,
+                                  BlockHandle* block_handle) {
+  return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found, block_handle);
+}
+
+Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found,
+                           BlockHandle* block_handle = nullptr) {
+  return SeekToMetaBlock(meta_iter, kRangeDelBlock, is_found, block_handle);
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/table_properties_internal.h b/src/rocksdb/table/table_properties_internal.h
new file mode 100644
index 00000000..888b43d2
--- /dev/null
+++ b/src/rocksdb/table/table_properties_internal.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+class BlockHandle;
+
+// Seek to the properties block.
+// If it successfully seeks to the properties block, "is_found" will be
+// set to true.
+Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found);
+
+// Seek to the compression dictionary block.
+// If it successfully seeks to the compression dictionary block, "is_found"
+// will be set to true.
+Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found,
+                                  BlockHandle* block_handle);
+
+// TODO(andrewkr) should not put all meta block in table_properties.h/cc
+Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found,
+                           BlockHandle* block_handle);
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/table_reader.h b/src/rocksdb/table/table_reader.h
new file mode 100644
index 00000000..a5f15e13
--- /dev/null
+++ b/src/rocksdb/table/table_reader.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/slice_transform.h"
+#include "table/internal_iterator.h"
+
+namespace rocksdb {
+
+class Iterator;
+struct ParsedInternalKey;
+class Slice;
+class Arena;
+struct ReadOptions;
+struct TableProperties;
+class GetContext;
+
+// A Table is a sorted map from strings to strings.  Tables are
+// immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  // skip_filters: disables checking the bloom filters even if they exist. This
+  //               option is effective only for block-based table format.
+  virtual InternalIterator* NewIterator(const ReadOptions&,
+                                        const SliceTransform* prefix_extractor,
+                                        Arena* arena = nullptr,
+                                        bool skip_filters = false,
+                                        bool for_compaction = false) = 0;
+
+  virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& /*read_options*/) {
+    return nullptr;
+  }
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0;
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise.
+  virtual void SetupForCompaction() = 0;
+
+  virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
+
+  // Prepare work that can be done before the real Get()
+  virtual void Prepare(const Slice& /*target*/) {}
+
+  // Report an approximation of how much memory has been used.
+  virtual size_t ApproximateMemoryUsage() const = 0;
+
+  // Calls get_context->SaveValue() repeatedly, starting with
+  // the entry found after a call to Seek(key), until it returns false.
+  // May not make such a call if filter policy says that key is not present.
+  //
+  // get_context->MarkKeyMayExist needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  // skip_filters: disables checking the bloom filters even if they exist. This
+  //               option is effective only for block-based table format.
+  virtual Status Get(const ReadOptions& readOptions, const Slice& key,
+                     GetContext* get_context,
+                     const SliceTransform* prefix_extractor,
+                     bool skip_filters = false) = 0;
+
+  // Prefetch data corresponding to a given range of keys.
+  // Typically this functionality is required for table implementations that
+  // persist the data on a non-volatile storage medium like disk/SSD.
+  virtual Status Prefetch(const Slice* begin = nullptr,
+                          const Slice* end = nullptr) {
+    (void) begin;
+    (void) end;
+    // Default implementation is NOOP.
+    // The child class should implement functionality when applicable.
+    return Status::OK();
+  }
+
+  // Convert the DB file to a human-readable form.
+  virtual Status DumpTable(WritableFile* /*out_file*/,
+                           const SliceTransform* /*prefix_extractor*/) {
+    return Status::NotSupported("DumpTable() not supported");
+  }
+
+  // Check whether there is corruption in this DB file.
+  virtual Status VerifyChecksum() {
+    return Status::NotSupported("VerifyChecksum() not supported");
+  }
+
+  virtual void Close() {}
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc
new file mode 100644
index 00000000..a9b75715
--- /dev/null
+++ b/src/rocksdb/table/table_reader_bench.cc
@@ -0,0 +1,343 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "util/file_reader_writer.h"
+#include "util/gflags_compat.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+namespace rocksdb {
+
+namespace {
+// Make a key where i determines the first 4 characters and j determines the
+// last 4 characters.
+static std::string MakeKey(int i, int j, bool through_db) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+  if (through_db) {
+    return std::string(buf);
+  }
+  // If we query the table directly, which operates on internal keys
+  // instead of user keys, we need to add 8 bytes of internal
+  // information (row type etc) to the user key to make an internal
+  // key.
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+uint64_t Now(Env* env, bool measured_by_nanosecond) {
+  return measured_by_nanosecond ? env->NowNanos() : env->NowMicros();
+}
+}  // namespace
+
+// A very simple benchmark:
+// Create a table with roughly numKey1 * numKey2 keys, where there are
+// numKey1 key prefixes, each with numKey2 distinct keys differing in the
+// suffix part.
+// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
+// times randomly.
+// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
+// Print out the total time.
+// If through_db = true, a full DB will be created and queries will be run
+// against it. Otherwise, operations will go directly through the table level.
+//
+// If for_iterator = true, instead of querying one key each time, it queries
+// a range sharing the same prefix.
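MakeKey() above therefore yields 16-byte user keys such as "0003__key___0007", and in the direct-table case wraps them into internal keys with an 8-byte sequence-number/type trailer, which is why table-level and DB-level lookups use different key strings. A tiny sketch of that layout, using only classes the benchmark already includes; the helper name is made up for illustration:

#include <cassert>
#include <cstdio>
#include <string>
#include "db/dbformat.h"

// Sketch only: shows the key layout the benchmark generates.
void ShowBenchKeyLayout() {
  char buf[100];
  std::snprintf(buf, sizeof(buf), "%04d__key___%04d", 3, 7);
  std::string user_key(buf);  // "0003__key___0007", 16 bytes
  rocksdb::InternalKey ikey(user_key, 0 /* sequence */,
                            rocksdb::ValueType::kTypeValue);
  std::string internal_key = ikey.Encode().ToString();
  assert(internal_key.size() == user_key.size() + 8);  // 8-byte trailer added
  std::printf("user key: %zu bytes, internal key: %zu bytes\n",
              user_key.size(), internal_key.size());
}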
+namespace { +void TableReaderBenchmark(Options& opts, EnvOptions& env_options, + ReadOptions& read_options, int num_keys1, + int num_keys2, int num_iter, int /*prefix_len*/, + bool if_query_empty_keys, bool for_iterator, + bool through_db, bool measured_by_nanosecond) { + rocksdb::InternalKeyComparator ikc(opts.comparator); + + std::string file_name = + test::PerThreadDBPath("rocksdb_table_reader_benchmark"); + std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); + WriteOptions wo; + Env* env = Env::Default(); + TableBuilder* tb = nullptr; + DB* db = nullptr; + Status s; + const ImmutableCFOptions ioptions(opts); + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + std::unique_ptr<WritableFileWriter> file_writer; + if (!through_db) { + std::unique_ptr<WritableFile> file; + env->NewWritableFile(file_name, &file, env_options); + + std::vector<std::unique_ptr<IntTblPropCollectorFactory> > + int_tbl_prop_collector_factories; + + file_writer.reset( + new WritableFileWriter(std::move(file), file_name, env_options)); + int unknown_level = -1; + tb = opts.table_factory->NewTableBuilder( + TableBuilderOptions( + ioptions, moptions, ikc, &int_tbl_prop_collector_factories, + CompressionType::kNoCompression, 0 /* sample_for_compression */, + CompressionOptions(), false /* skip_filters */, + kDefaultColumnFamilyName, unknown_level), + 0 /* column_family_id */, file_writer.get()); + } else { + s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + } + // Populate slightly more than 1M keys + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + std::string key = MakeKey(i * 2, j, through_db); + if (!through_db) { + tb->Add(key, key); + } else { + db->Put(wo, key, key); + } + } + } + if (!through_db) { + tb->Finish(); + file_writer->Close(); + } else { + db->Flush(FlushOptions()); + } + + std::unique_ptr<TableReader> table_reader; + if (!through_db) { + std::unique_ptr<RandomAccessFile> raf; + s = env->NewRandomAccessFile(file_name, &raf, env_options); + if (!s.ok()) { + fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); + exit(1); + } + uint64_t file_size; + env->GetFileSize(file_name, &file_size); + std::unique_ptr<RandomAccessFileReader> file_reader( + new RandomAccessFileReader(std::move(raf), file_name)); + s = opts.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), + env_options, ikc), + std::move(file_reader), file_size, &table_reader); + if (!s.ok()) { + fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); + exit(1); + } + } + + Random rnd(301); + std::string result; + HistogramImpl hist; + + for (int it = 0; it < num_iter; it++) { + for (int i = 0; i < num_keys1; i++) { + for (int j = 0; j < num_keys2; j++) { + int r1 = rnd.Uniform(num_keys1) * 2; + int r2 = rnd.Uniform(num_keys2); + if (if_query_empty_keys) { + r1++; + r2 = num_keys2 * 2 - r2; + } + + if (!for_iterator) { + // Query one existing key; + std::string key = MakeKey(r1, r2, through_db); + uint64_t start_time = Now(env, measured_by_nanosecond); + if (!through_db) { + PinnableSlice value; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + GetContext get_context(ioptions.user_comparator, + ioptions.merge_operator, ioptions.info_log, + ioptions.statistics, GetContext::kNotFound, + Slice(key), &value, nullptr, &merge_context, + &max_covering_tombstone_seq, env); + s = table_reader->Get(read_options, key, &get_context, nullptr); + } 
else { + s = db->Get(read_options, key, &result); + } + hist.Add(Now(env, measured_by_nanosecond) - start_time); + } else { + int r2_len; + if (if_query_empty_keys) { + r2_len = 0; + } else { + r2_len = rnd.Uniform(num_keys2) + 1; + if (r2_len + r2 > num_keys2) { + r2_len = num_keys2 - r2; + } + } + std::string start_key = MakeKey(r1, r2, through_db); + std::string end_key = MakeKey(r1, r2 + r2_len, through_db); + uint64_t total_time = 0; + uint64_t start_time = Now(env, measured_by_nanosecond); + Iterator* iter = nullptr; + InternalIterator* iiter = nullptr; + if (!through_db) { + iiter = table_reader->NewIterator(read_options, nullptr); + } else { + iter = db->NewIterator(read_options); + } + int count = 0; + for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key); + through_db ? iter->Valid() : iiter->Valid(); + through_db ? iter->Next() : iiter->Next()) { + if (if_query_empty_keys) { + break; + } + // verify key; + total_time += Now(env, measured_by_nanosecond) - start_time; + assert(Slice(MakeKey(r1, r2 + count, through_db)) == + (through_db ? iter->key() : iiter->key())); + start_time = Now(env, measured_by_nanosecond); + if (++count >= r2_len) { + break; + } + } + if (count != r2_len) { + fprintf( + stderr, "Iterator cannot iterate expected number of entries. " + "Expected %d but got %d\n", r2_len, count); + assert(false); + } + delete iter; + total_time += Now(env, measured_by_nanosecond) - start_time; + hist.Add(total_time); + } + } + } + } + + fprintf( + stderr, + "===================================================" + "====================================================\n" + "InMemoryTableSimpleBenchmark: %20s num_key1: %5d " + "num_key2: %5d %10s\n" + "===================================================" + "====================================================" + "\nHistogram (unit: %s): \n%s", + opts.table_factory->Name(), num_keys1, num_keys2, + for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"), + measured_by_nanosecond ? "nanosecond" : "microsecond", + hist.ToString().c_str()); + if (!through_db) { + env->DeleteFile(file_name); + } else { + delete db; + db = nullptr; + DestroyDB(dbname, opts); + } +} +} // namespace +} // namespace rocksdb + +DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " + "ones."); +DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); +DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); +DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); +DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); +DEFINE_bool(iterator, false, "For test iterator"); +DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " + "the query will be against DB. Otherwise, will be directly against " + "a table reader."); +DEFINE_bool(mmap_read, true, "Whether use mmap read"); +DEFINE_string(table_factory, "block_based", + "Table factory to use: `block_based` (default), `plain_table` or " + "`cuckoo_hash`."); +DEFINE_string(time_unit, "microsecond", + "The time unit used for measuring performance. 
User can specify "
+              "`microsecond` (default) or `nanosecond`");
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  std::shared_ptr<rocksdb::TableFactory> tf;
+  rocksdb::Options options;
+  if (FLAGS_prefix_len < 16) {
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len));
+  }
+  rocksdb::ReadOptions ro;
+  rocksdb::EnvOptions env_options;
+  options.create_if_missing = true;
+  options.compression = rocksdb::CompressionType::kNoCompression;
+
+  if (FLAGS_table_factory == "cuckoo_hash") {
+#ifndef ROCKSDB_LITE
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    env_options.use_mmap_reads = FLAGS_mmap_read;
+    rocksdb::CuckooTableOptions table_options;
+    table_options.hash_table_ratio = 0.75;
+    tf.reset(rocksdb::NewCuckooTableFactory(table_options));
+#else
+    fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "plain_table") {
+#ifndef ROCKSDB_LITE
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    env_options.use_mmap_reads = FLAGS_mmap_read;
+
+    rocksdb::PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 16;
+    plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8;
+    plain_table_options.hash_table_ratio = 0.75;
+
+    tf.reset(new rocksdb::PlainTableFactory(plain_table_options));
+    options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
+        FLAGS_prefix_len));
+#else
+    fprintf(stderr, "Plain table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "block_based") {
+    tf.reset(new rocksdb::BlockBasedTableFactory());
+  } else {
+    fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str());
+  }
+
+  if (tf) {
+    // If the user provides an invalid time unit, just fall back to microsecond.
+    bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
+
+    options.table_factory = tf;
+    rocksdb::TableReaderBenchmark(options, env_options, ro, FLAGS_num_keys1,
+                                  FLAGS_num_keys2, FLAGS_iter, FLAGS_prefix_len,
+                                  FLAGS_query_empty, FLAGS_iterator,
+                                  FLAGS_through_db, measured_by_nanosecond);
+  } else {
+    return 1;
+  }
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc
new file mode 100644
index 00000000..f217fe50
--- /dev/null
+++ b/src/rocksdb/table/table_test.cc
@@ -0,0 +1,3879 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include <stdio.h> + +#include <algorithm> +#include <iostream> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "cache/lru_cache.h" +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "memtable/stl_wrappers.h" +#include "monitoring/statistics.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_factory.h" +#include "table/block_based_table_reader.h" +#include "table/block_builder.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "util/compression.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/sync_point.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; + +namespace { + +// DummyPropertiesCollector used to test BlockBasedTableProperties +class DummyPropertiesCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return ""; } + + Status Finish(UserCollectedProperties* /*properties*/) override { + return Status::OK(); + } + + Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override { + return Status::OK(); + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } +}; + +class DummyPropertiesCollectorFactory1 + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new DummyPropertiesCollector(); + } + const char* Name() const override { return "DummyPropertiesCollector1"; } +}; + +class DummyPropertiesCollectorFactory2 + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return new DummyPropertiesCollector(); + } + const char* Name() const override { return "DummyPropertiesCollector2"; } +}; + +// Return reverse of "key". +// Used to test non-lexicographic comparators. 
+std::string Reverse(const Slice& key) { + auto rev = key.ToString(); + std::reverse(rev.begin(), rev.end()); + return rev; +} + +class ReverseKeyComparator : public Comparator { + public: + const char* Name() const override { + return "rocksdb.ReverseBytewiseComparator"; + } + + int Compare(const Slice& a, const Slice& b) const override { + return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + std::string s = Reverse(*start); + std::string l = Reverse(limit); + BytewiseComparator()->FindShortestSeparator(&s, l); + *start = Reverse(s); + } + + void FindShortSuccessor(std::string* key) const override { + std::string s = Reverse(*key); + BytewiseComparator()->FindShortSuccessor(&s); + *key = Reverse(s); + } +}; + +ReverseKeyComparator reverse_key_comparator; + +void Increment(const Comparator* cmp, std::string* key) { + if (cmp == BytewiseComparator()) { + key->push_back('\0'); + } else { + assert(cmp == &reverse_key_comparator); + std::string rev = Reverse(*key); + rev.push_back('\0'); + *key = Reverse(rev); + } +} + +} // namespace + +// Helper class for tests to unify the interface between +// BlockBuilder/TableBuilder and Block/Table. +class Constructor { + public: + explicit Constructor(const Comparator* cmp) + : data_(stl_wrappers::LessOfComparator(cmp)) {} + virtual ~Constructor() { } + + void Add(const std::string& key, const Slice& value) { + data_[key] = value.ToString(); + } + + // Finish constructing the data structure with all the keys that have + // been added so far. Returns the keys in sorted order in "*keys" + // and stores the key/value pairs in "*kvmap" + void Finish(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) { + last_internal_key_ = &internal_comparator; + *kvmap = data_; + keys->clear(); + for (const auto& kv : data_) { + keys->push_back(kv.first); + } + data_.clear(); + Status s = FinishImpl(options, ioptions, moptions, table_options, + internal_comparator, *kvmap); + ASSERT_TRUE(s.ok()) << s.ToString(); + } + + // Construct the data structure from the data in "data" + virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const stl_wrappers::KVMap& data) = 0; + + virtual InternalIterator* NewIterator( + const SliceTransform* prefix_extractor = nullptr) const = 0; + + virtual const stl_wrappers::KVMap& data() { return data_; } + + virtual bool IsArenaMode() const { return false; } + + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + + virtual bool AnywayDeleteIterator() const { return false; } + + protected: + const InternalKeyComparator* last_internal_key_; + + private: + stl_wrappers::KVMap data_; +}; + +class BlockConstructor: public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp), + block_(nullptr) { } + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const 
stl_wrappers::KVMap& kv_map) override { + delete block_; + block_ = nullptr; + BlockBuilder builder(table_options.block_restart_interval); + + for (const auto kv : kv_map) { + builder.Add(kv.first, kv.second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + block_ = new Block(std::move(contents), kDisableGlobalSequenceNumber); + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return block_->NewIterator<DataBlockIter>(comparator_, comparator_); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +// A helper class that converts internal format keys into user keys +class KeyConvertingIterator : public InternalIterator { + public: + explicit KeyConvertingIterator(InternalIterator* iter, + bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + ~KeyConvertingIterator() override { + if (arena_mode_) { + iter_->~InternalIterator(); + } else { + delete iter_; + } + } + bool Valid() const override { return iter_->Valid() && status_.ok(); } + void Seek(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->Seek(encoded); + } + void SeekForPrev(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->SeekForPrev(encoded); + } + void SeekToFirst() override { iter_->SeekToFirst(); } + void SeekToLast() override { iter_->SeekToLast(); } + void Next() override { iter_->Next(); } + void Prev() override { iter_->Prev(); } + + Slice key() const override { + assert(Valid()); + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter_->key(), &parsed_key)) { + status_ = Status::Corruption("malformed internal key"); + return Slice("corrupted key"); + } + return parsed_key.user_key; + } + + Slice value() const override { return iter_->value(); } + Status status() const override { + return status_.ok() ? 
iter_->status() : status_; + } + + private: + mutable Status status_; + InternalIterator* iter_; + bool arena_mode_; + + // No copying allowed + KeyConvertingIterator(const KeyConvertingIterator&); + void operator=(const KeyConvertingIterator&); +}; + +class TableConstructor: public Constructor { + public: + explicit TableConstructor(const Comparator* cmp, + bool convert_to_internal_key = false, + int level = -1) + : Constructor(cmp), + convert_to_internal_key_(convert_to_internal_key), + level_(level) {} + ~TableConstructor() override { Reset(); } + + Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& internal_comparator, + const stl_wrappers::KVMap& kv_map) override { + Reset(); + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), + "" /* don't care */)); + std::unique_ptr<TableBuilder> builder; + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, options.sample_for_compression, + options.compression_opts, false /* skip_filters */, + column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer_.get())); + + for (const auto kv : kv_map) { + if (convert_to_internal_key_) { + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder->Add(encoded, kv.second); + } else { + builder->Add(kv.first, kv.second); + } + EXPECT_TRUE(builder->status().ok()); + } + Status s = builder->Finish(); + file_writer_->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); + + // Open the table + uniq_id_ = cur_uniq_id_++; + file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + return ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); + } + + InternalIterator* NewIterator( + const SliceTransform* prefix_extractor) const override { + ReadOptions ro; + InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); + if (convert_to_internal_key_) { + return new KeyConvertingIterator(iter); + } else { + return iter; + } + } + + uint64_t ApproximateOffsetOf(const Slice& key) const { + if (convert_to_internal_key_) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + const Slice skey = ikey.Encode(); + return table_reader_->ApproximateOffsetOf(skey); + } + return table_reader_->ApproximateOffsetOf(key); + } + + virtual Status Reopen(const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions) { + file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + return ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + 
*last_internal_key_), + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); + } + + virtual TableReader* GetTableReader() { return table_reader_.get(); } + + bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + + void ResetTableReader() { table_reader_.reset(); } + + bool ConvertToInternalKey() { return convert_to_internal_key_; } + + test::StringSink* TEST_GetSink() { + return static_cast<test::StringSink*>(file_writer_->writable_file()); + } + + private: + void Reset() { + uniq_id_ = 0; + table_reader_.reset(); + file_writer_.reset(); + file_reader_.reset(); + } + + uint64_t uniq_id_; + std::unique_ptr<WritableFileWriter> file_writer_; + std::unique_ptr<RandomAccessFileReader> file_reader_; + std::unique_ptr<TableReader> table_reader_; + bool convert_to_internal_key_; + int level_; + + TableConstructor(); + + static uint64_t cur_uniq_id_; + EnvOptions soptions; +}; +uint64_t TableConstructor::cur_uniq_id_ = 1; + +class MemTableConstructor: public Constructor { + public: + explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb) + : Constructor(cmp), + internal_comparator_(cmp), + write_buffer_manager_(wb), + table_factory_(new SkipListFactory) { + options_.memtable_factory = table_factory_; + ImmutableCFOptions ioptions(options_); + memtable_ = + new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), + wb, kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + ~MemTableConstructor() override { delete memtable_->Unref(); } + Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete memtable_->Unref(); + ImmutableCFOptions mem_ioptions(ioptions); + memtable_ = new MemTable(internal_comparator_, mem_ioptions, + MutableCFOptions(options_), write_buffer_manager_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + int seq = 1; + for (const auto kv : kv_map) { + memtable_->Add(seq, kTypeValue, kv.first, kv.second); + seq++; + } + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); + } + + bool AnywayDeleteIterator() const override { return true; } + + bool IsArenaMode() const override { return true; } + + private: + mutable Arena arena_; + InternalKeyComparator internal_comparator_; + Options options_; + WriteBufferManager* write_buffer_manager_; + MemTable* memtable_; + std::shared_ptr<SkipListFactory> table_factory_; +}; + +class InternalIteratorFromIterator : public InternalIterator { + public: + explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {} + bool Valid() const override { return it_->Valid(); } + void Seek(const Slice& target) override { it_->Seek(target); } + void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); } + void SeekToFirst() override { it_->SeekToFirst(); } + void SeekToLast() override { it_->SeekToLast(); } + void Next() override { it_->Next(); } + void Prev() override { it_->Prev(); } + Slice key() const override { return it_->key(); } + Slice value() const override { return it_->value(); } + Status status() const override { return it_->status(); } + + private: + std::unique_ptr<Iterator> it_; +}; + +class DBConstructor: public 
Constructor { + public: + explicit DBConstructor(const Comparator* cmp) + : Constructor(cmp), + comparator_(cmp) { + db_ = nullptr; + NewDB(); + } + ~DBConstructor() override { delete db_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete db_; + db_ = nullptr; + NewDB(); + for (const auto kv : kv_map) { + WriteBatch batch; + batch.Put(kv.first, kv.second); + EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok()); + } + return Status::OK(); + } + + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions())); + } + + DB* db() const override { return db_; } + + private: + void NewDB() { + std::string name = test::PerThreadDBPath("table_testdb"); + + Options options; + options.comparator = comparator_; + Status status = DestroyDB(name, options); + ASSERT_TRUE(status.ok()) << status.ToString(); + + options.create_if_missing = true; + options.error_if_exists = true; + options.write_buffer_size = 10000; // Something small to force merging + status = DB::Open(options, name, &db_); + ASSERT_TRUE(status.ok()) << status.ToString(); + } + + const Comparator* comparator_; + DB* db_; +}; + +enum TestType { + BLOCK_BASED_TABLE_TEST, +#ifndef ROCKSDB_LITE + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, +#endif // !ROCKSDB_LITE + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST +}; + +struct TestArgs { + TestType type; + bool reverse_compare; + int restart_interval; + CompressionType compression; + uint32_t format_version; + bool use_mmap; +}; + +static std::vector<TestArgs> GenerateArgList() { + std::vector<TestArgs> test_args; + std::vector<TestType> test_types = { + BLOCK_BASED_TABLE_TEST, +#ifndef ROCKSDB_LITE + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, +#endif // !ROCKSDB_LITE + BLOCK_TEST, + MEMTABLE_TEST, DB_TEST}; + std::vector<bool> reverse_compare_types = {false, true}; + std::vector<int> restart_intervals = {16, 1, 1024}; + + // Only add compression if it is supported + std::vector<std::pair<CompressionType, bool>> compression_types; + compression_types.emplace_back(kNoCompression, false); + if (Snappy_Supported()) { + compression_types.emplace_back(kSnappyCompression, false); + } + if (Zlib_Supported()) { + compression_types.emplace_back(kZlibCompression, false); + compression_types.emplace_back(kZlibCompression, true); + } + if (BZip2_Supported()) { + compression_types.emplace_back(kBZip2Compression, false); + compression_types.emplace_back(kBZip2Compression, true); + } + if (LZ4_Supported()) { + compression_types.emplace_back(kLZ4Compression, false); + compression_types.emplace_back(kLZ4Compression, true); + compression_types.emplace_back(kLZ4HCCompression, false); + compression_types.emplace_back(kLZ4HCCompression, true); + } + if (XPRESS_Supported()) { + compression_types.emplace_back(kXpressCompression, false); + compression_types.emplace_back(kXpressCompression, true); + } + if (ZSTD_Supported()) { + compression_types.emplace_back(kZSTD, false); + compression_types.emplace_back(kZSTD, true); + } + + for (auto test_type : test_types) { + for (auto reverse_compare : reverse_compare_types) { +#ifndef ROCKSDB_LITE + if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || + 
test_type == PLAIN_TABLE_FULL_STR_PREFIX || + test_type == PLAIN_TABLE_TOTAL_ORDER) { + // Plain table doesn't use restart index or compression. + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_intervals[0]; + one_arg.compression = compression_types[0].first; + one_arg.use_mmap = true; + test_args.push_back(one_arg); + one_arg.use_mmap = false; + test_args.push_back(one_arg); + continue; + } +#endif // !ROCKSDB_LITE + + for (auto restart_interval : restart_intervals) { + for (auto compression_type : compression_types) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type.first; + one_arg.format_version = compression_type.second ? 2 : 1; + one_arg.use_mmap = false; + test_args.push_back(one_arg); + } + } + } + } + return test_args; +} + +// In order to make all tests run for plain table format, including +// those operating on empty keys, create a new prefix transformer which +// return fixed prefix if the slice is not shorter than the prefix length, +// and the full slice if it is shorter. +class FixedOrLessPrefixTransform : public SliceTransform { + private: + const size_t prefix_len_; + + public: + explicit FixedOrLessPrefixTransform(size_t prefix_len) : + prefix_len_(prefix_len) { + } + + const char* Name() const override { return "rocksdb.FixedPrefix"; } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + if (src.size() < prefix_len_) { + return src; + } + return Slice(src.data(), prefix_len_); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= prefix_len_); + } + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } +}; + +class HarnessTest : public testing::Test { + public: + HarnessTest() + : ioptions_(options_), + moptions_(options_), + constructor_(nullptr), + write_buffer_(options_.db_write_buffer_size) {} + + void Init(const TestArgs& args) { + delete constructor_; + constructor_ = nullptr; + options_ = Options(); + options_.compression = args.compression; + // Use shorter block size for tests to exercise block boundary + // conditions more. 
+ if (args.reverse_compare) { + options_.comparator = &reverse_key_comparator; + } + + internal_comparator_.reset( + new test::PlainInternalKeyComparator(options_.comparator)); + + support_prev_ = true; + only_support_prefix_seek_ = false; + options_.allow_mmap_reads = args.use_mmap; + switch (args.type) { + case BLOCK_BASED_TABLE_TEST: + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + table_options_.block_size = 256; + table_options_.block_restart_interval = args.restart_interval; + table_options_.index_block_restart_interval = args.restart_interval; + table_options_.format_version = args.format_version; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_ = new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + case PLAIN_TABLE_SEMI_FIXED_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_FULL_STR_PREFIX: + support_prev_ = false; + only_support_prefix_seek_ = true; + options_.prefix_extractor.reset(NewNoopTransform()); + options_.table_factory.reset(NewPlainTableFactory()); + constructor_ = new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; + case PLAIN_TABLE_TOTAL_ORDER: + support_prev_ = false; + only_support_prefix_seek_ = false; + options_.prefix_extractor = nullptr; + + { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + + options_.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } + constructor_ = new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */); + internal_comparator_.reset( + new InternalKeyComparator(options_.comparator)); + break; +#endif // !ROCKSDB_LITE + case BLOCK_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_ = new BlockConstructor(options_.comparator); + break; + case MEMTABLE_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_ = new MemTableConstructor(options_.comparator, + &write_buffer_); + break; + case DB_TEST: + table_options_.block_size = 256; + options_.table_factory.reset( + new BlockBasedTableFactory(table_options_)); + constructor_ = new DBConstructor(options_.comparator); + break; + } + ioptions_ = ImmutableCFOptions(options_); + moptions_ = MutableCFOptions(options_); + } + + ~HarnessTest() override { delete constructor_; } + + void Add(const std::string& key, const std::string& value) { + constructor_->Add(key, value); + } + + void Test(Random* rnd) { + std::vector<std::string> keys; + stl_wrappers::KVMap data; + constructor_->Finish(options_, ioptions_, moptions_, table_options_, + *internal_comparator_, &keys, &data); + + TestForwardScan(keys, data); + if 
(support_prev_) { + TestBackwardScan(keys, data); + } + TestRandomAccess(rnd, keys, data); + } + + void TestForwardScan(const std::vector<std::string>& /*keys*/, + const stl_wrappers::KVMap& data) { + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + for (stl_wrappers::KVMap::const_iterator model_iter = data.begin(); + model_iter != data.end(); ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + void TestBackwardScan(const std::vector<std::string>& /*keys*/, + const stl_wrappers::KVMap& data) { + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + iter->SeekToLast(); + for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin(); + model_iter != data.rend(); ++model_iter) { + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + iter->Prev(); + } + ASSERT_TRUE(!iter->Valid()); + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys, + const stl_wrappers::KVMap& data) { + static const bool kVerbose = false; + InternalIterator* iter = constructor_->NewIterator(); + ASSERT_TRUE(!iter->Valid()); + stl_wrappers::KVMap::const_iterator model_iter = data.begin(); + if (kVerbose) fprintf(stderr, "---\n"); + for (int i = 0; i < 200; i++) { + const int toss = rnd->Uniform(support_prev_ ? 5 : 3); + switch (toss) { + case 0: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Next\n"); + iter->Next(); + ++model_iter; + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 1: { + if (kVerbose) fprintf(stderr, "SeekToFirst\n"); + iter->SeekToFirst(); + model_iter = data.begin(); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 2: { + std::string key = PickRandomKey(rnd, keys); + model_iter = data.lower_bound(key); + if (kVerbose) fprintf(stderr, "Seek '%s'\n", + EscapeString(key).c_str()); + iter->Seek(Slice(key)); + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + + case 3: { + if (iter->Valid()) { + if (kVerbose) fprintf(stderr, "Prev\n"); + iter->Prev(); + if (model_iter == data.begin()) { + model_iter = data.end(); // Wrap around to invalid value + } else { + --model_iter; + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + } + break; + } + + case 4: { + if (kVerbose) fprintf(stderr, "SeekToLast\n"); + iter->SeekToLast(); + if (keys.empty()) { + model_iter = data.end(); + } else { + std::string last = data.rbegin()->first; + model_iter = data.lower_bound(last); + } + ASSERT_EQ(ToString(data, model_iter), ToString(iter)); + break; + } + } + } + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~InternalIterator(); + } else { + delete iter; + } + } + + std::string ToString(const stl_wrappers::KVMap& data, + const stl_wrappers::KVMap::const_iterator& it) { + if (it == data.end()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + + std::string ToString(const stl_wrappers::KVMap& data, + const stl_wrappers::KVMap::const_reverse_iterator& it) { + if (it == data.rend()) { + return "END"; + } else { + return "'" + it->first + "->" + it->second + "'"; + } + } + 
+ std::string ToString(const InternalIterator* it) { + if (!it->Valid()) { + return "END"; + } else { + return "'" + it->key().ToString() + "->" + it->value().ToString() + "'"; + } + } + + std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) { + if (keys.empty()) { + return "foo"; + } else { + const int index = rnd->Uniform(static_cast<int>(keys.size())); + std::string result = keys[index]; + switch (rnd->Uniform(support_prev_ ? 3 : 1)) { + case 0: + // Return an existing key + break; + case 1: { + // Attempt to return something smaller than an existing key + if (result.size() > 0 && result[result.size() - 1] > '\0' + && (!only_support_prefix_seek_ + || options_.prefix_extractor->Transform(result).size() + < result.size())) { + result[result.size() - 1]--; + } + break; + } + case 2: { + // Return something larger than an existing key + Increment(options_.comparator, &result); + break; + } + } + return result; + } + } + + // Returns nullptr if not running against a DB + DB* db() const { return constructor_->db(); } + + void RandomizedHarnessTest(size_t part, size_t total) { + std::vector<TestArgs> args = GenerateArgList(); + assert(part); + assert(part <= total); + for (size_t i = 0; i < args.size(); i++) { + if ((i % total) + 1 != part) { + continue; + } + Init(args[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 1 : 200)) { + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } + } + + private: + Options options_ = Options(); + ImmutableCFOptions ioptions_; + MutableCFOptions moptions_; + BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); + Constructor* constructor_; + WriteBufferManager write_buffer_; + bool support_prev_; + bool only_support_prefix_seek_; + std::shared_ptr<InternalKeyComparator> internal_comparator_; +}; + +static bool Between(uint64_t val, uint64_t low, uint64_t high) { + bool result = (val >= low) && (val <= high); + if (!result) { + fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", + (unsigned long long)(val), + (unsigned long long)(low), + (unsigned long long)(high)); + } + return result; +} + +// Tests against all kinds of tables +class TableTest : public testing::Test { + public: + const InternalKeyComparator& GetPlainInternalComparator( + const Comparator* comp) { + if (!plain_internal_comparator) { + plain_internal_comparator.reset( + new test::PlainInternalKeyComparator(comp)); + } + return *plain_internal_comparator; + } + void IndexTest(BlockBasedTableOptions table_options); + + private: + std::unique_ptr<InternalKeyComparator> plain_internal_comparator; +}; + +class GeneralTableTest : public TableTest {}; +class BlockBasedTableTest + : public TableTest, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + BlockBasedTableTest() : format_(GetParam()) {} + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions options; + options.format_version = format_; + return options; + } + + protected: + uint64_t IndexUncompressedHelper(bool indexCompress); + + private: + uint32_t format_; +}; +class PlainTableTest : public TableTest {}; +class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; + +INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, + testing::Values(test::kDefaultFormatVersion)); 
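BlockBasedTableTest above is a gtest value-parameterized suite: its constructor reads the table format version through GetParam(), and each INSTANTIATE_TEST_CASE_P invocation (FormatDef here, FormatLatest just below) runs every TEST_P of the fixture once per supplied value. As a rough, self-contained sketch of that mechanism only -- the fixture name and version values below are illustrative and are not part of this commit:

#include <gtest/gtest.h>

// Hypothetical fixture parameterized by a table format version, mirroring
// how BlockBasedTableTest stores GetParam() into format_ in its constructor.
class FormatParamTest : public ::testing::TestWithParam<uint32_t> {
 protected:
  uint32_t format_version() const { return GetParam(); }
};

TEST_P(FormatParamTest, ParamIsVisible) {
  // Every instantiation below re-runs this body with its own value.
  ASSERT_GE(format_version(), 2u);
}

// One suite instantiation per value list; this is how FormatDef and
// FormatLatest both exercise the same table tests with different versions.
INSTANTIATE_TEST_CASE_P(IllustrativeVersions, FormatParamTest,
                        ::testing::Values(2u, 4u));

gtest then reports one run per value, e.g. IllustrativeVersions/FormatParamTest.ParamIsVisible/0 and /1.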
+INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest, + testing::Values(test::kLatestFormatVersion)); + +// This test serves as the living tutorial for the prefix scan of user collected +// properties. +TEST_F(TablePropertyTest, PrefixScanTest) { + UserCollectedProperties props{{"num.111.1", "1"}, + {"num.111.2", "2"}, + {"num.111.3", "3"}, + {"num.333.1", "1"}, + {"num.333.2", "2"}, + {"num.333.3", "3"}, + {"num.555.1", "1"}, + {"num.555.2", "2"}, + {"num.555.3", "3"}, }; + + // prefixes that exist + for (const std::string& prefix : {"num.111", "num.333", "num.555"}) { + int num = 0; + for (auto pos = props.lower_bound(prefix); + pos != props.end() && + pos->first.compare(0, prefix.size(), prefix) == 0; + ++pos) { + ++num; + auto key = prefix + "." + ToString(num); + ASSERT_EQ(key, pos->first); + ASSERT_EQ(ToString(num), pos->second); + } + ASSERT_EQ(3, num); + } + + // prefixes that don't exist + for (const std::string& prefix : + {"num.000", "num.222", "num.444", "num.666"}) { + auto pos = props.lower_bound(prefix); + ASSERT_TRUE(pos == props.end() || + pos->first.compare(0, prefix.size(), prefix) != 0); + } +} + +// This test include all the basic checks except those for index size and block +// size, which will be conducted in separated unit tests. +TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + uint64_t diff_internal_user_bytes = 9 * 8; // 8 is seq size, 9 k-v totally + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kNoCompression; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + ioptions.statistics = options.statistics.get(); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); + + auto& props = *c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(kvmap.size(), props.num_entries); + + auto raw_key_size = kvmap.size() * 2ul; + auto raw_value_size = kvmap.size() * 4ul; + + ASSERT_EQ(raw_key_size + diff_internal_user_bytes, props.raw_key_size); + ASSERT_EQ(raw_value_size, props.raw_value_size); + ASSERT_EQ(1ul, props.num_data_blocks); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + + // Verify data size. 
+ BlockBuilder block_builder(1); + for (const auto& item : kvmap) { + block_builder.Add(item.first, item.second); + } + Slice content = block_builder.Finish(); + ASSERT_EQ(content.size() + kBlockTrailerSize + diff_internal_user_bytes, + props.data_size); + c.ResetTableReader(); +} + +#ifdef SNAPPY +uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + constexpr size_t kNumKeys = 10000; + + for (size_t k = 0; k < kNumKeys; ++k) { + c.Add("key" + ToString(k), "val" + ToString(k)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kSnappyCompression; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + table_options.enable_index_compression = compressed; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + ioptions.statistics = options.statistics.get(); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c.ResetTableReader(); + return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +} +TEST_P(BlockBasedTableTest, IndexUncompressed) { + uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true); + uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false); + // tbl1_compressed_cnt should include 1 index block + EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt); +} +#endif // SNAPPY + +TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { + TableConstructor c(&reverse_key_comparator); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + + { + Options options; + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + // Default comparator + ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name); + // No merge operator + ASSERT_EQ("nullptr", props.merge_operator_name); + // No prefix extractor + ASSERT_EQ("nullptr", props.prefix_extractor_name); + // No property collectors + ASSERT_EQ("[]", props.property_collectors_names); + // No filter policy is used + ASSERT_EQ("", props.filter_policy_name); + // Compression type == that set: + ASSERT_EQ("NoCompression", props.compression_name); + c.ResetTableReader(); + } + + { + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.comparator = &reverse_key_comparator; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.prefix_extractor.reset(NewNoopTransform()); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory1()); + options.table_properties_collector_factories.emplace_back( + new DummyPropertiesCollectorFactory2()); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + 
c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + auto& props = *c.GetTableReader()->GetTableProperties(); + + ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name); + ASSERT_EQ("UInt64AddOperator", props.merge_operator_name); + ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name); + ASSERT_EQ("[DummyPropertiesCollector1,DummyPropertiesCollector2]", + props.property_collectors_names); + ASSERT_EQ("", props.filter_policy_name); // no filter policy is used + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, RangeDelBlock) { + TableConstructor c(BytewiseComparator()); + std::vector<std::string> keys = {"1pika", "2chu"}; + std::vector<std::string> vals = {"p", "c"}; + + std::vector<RangeTombstone> expected_tombstones = { + {"1pika", "2chu", 0}, + {"2chu", "c", 1}, + {"2chu", "c", 0}, + {"c", "p", 0}, + }; + + for (int i = 0; i < 2; i++) { + RangeTombstone t(keys[i], vals[i], i); + std::pair<InternalKey, Slice> p = t.Serialize(); + c.Add(p.first.Encode().ToString(), p.second); + } + + std::vector<std::string> sorted_keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + std::unique_ptr<InternalKeyComparator> internal_cmp( + new InternalKeyComparator(options.comparator)); + c.Finish(options, ioptions, moptions, table_options, *internal_cmp, + &sorted_keys, &kvmap); + + for (int j = 0; j < 2; ++j) { + std::unique_ptr<InternalIterator> iter( + c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions())); + if (j > 0) { + // For second iteration, delete the table reader object and verify the + // iterator can still access its metablock's range tombstones. 
+ c.ResetTableReader(); + } + ASSERT_FALSE(iter->Valid()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + for (size_t i = 0; i < expected_tombstones.size(); i++) { + ASSERT_TRUE(iter->Valid()); + ParsedInternalKey parsed_key; + ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); + RangeTombstone t(parsed_key, iter->value()); + const auto& expected_t = expected_tombstones[i]; + ASSERT_EQ(t.start_key_, expected_t.start_key_); + ASSERT_EQ(t.end_key_, expected_t.end_key_); + ASSERT_EQ(t.seq_, expected_t.seq_); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid()); + } +} + +TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("a1", "val1"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto& props = *c.GetTableReader()->GetTableProperties(); + ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); + c.ResetTableReader(); +} + +// +// BlockBasedTableTest::PrefetchTest +// +void AssertKeysInCache(BlockBasedTable* table_reader, + const std::vector<std::string>& keys_in_cache, + const std::vector<std::string>& keys_not_in_cache, + bool convert = false) { + if (convert) { + for (auto key : keys_in_cache) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + for (auto key : keys_not_in_cache) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + } else { + for (auto key : keys_in_cache) { + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + } + for (auto key : keys_not_in_cache) { + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + } + } +} + +void PrefetchRange(TableConstructor* c, Options* opt, + BlockBasedTableOptions* table_options, const char* key_begin, + const char* key_end, + const std::vector<std::string>& keys_in_cache, + const std::vector<std::string>& keys_not_in_cache, + const Status expected_status = Status::OK()) { + // reset the cache and reopen the table + table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); + const ImmutableCFOptions ioptions2(*opt); + const MutableCFOptions moptions(*opt); + ASSERT_OK(c->Reopen(ioptions2, moptions)); + + // prefetch + auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader()); + Status s; + std::unique_ptr<Slice> begin, end; + std::unique_ptr<InternalKey> i_begin, i_end; + if (key_begin != nullptr) { + if (c->ConvertToInternalKey()) { + i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue)); + begin.reset(new Slice(i_begin->Encode())); + } else { + begin.reset(new Slice(key_begin)); + } + } + if (key_end != nullptr) { + if (c->ConvertToInternalKey()) { + i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue)); + end.reset(new Slice(i_end->Encode())); + } else { + end.reset(new Slice(key_end)); + } + } + s = table_reader->Prefetch(begin.get(), end.get()); + + 
ASSERT_TRUE(s.code() == expected_status.code()); + + // assert our expectation in cache warmup + AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache, + c->ConvertToInternalKey()); + c->ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, PrefetchTest) { + // The purpose of this test is to test the prefetching operation built into + // BlockBasedTable. + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + c.ResetTableReader(); + + // We get the following data spread : + // + // Data block Index + // ======================== + // [ k01 k02 k03 ] k03 + // [ k04 ] k04 + // [ k05 ] k05 + // [ k06 k07 ] k07 + + + // Simple + PrefetchRange(&c, &opt, &table_options, + /*key_range=*/"k01", "k05", + /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"}, + /*keys_not_in_cache=*/{"k06", "k07"}); + PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // odd + PrefetchRange(&c, &opt, &table_options, "a", "z", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"}, + {"k04", "k05", "k06", "k07"}); + // Edge cases + PrefetchRange(&c, &opt, &table_options, "k00", "k06", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k00", "zzz", + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + // null keys + PrefetchRange(&c, &opt, &table_options, nullptr, nullptr, + {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {}); + PrefetchRange(&c, &opt, &table_options, "k04", nullptr, + {"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"}); + PrefetchRange(&c, &opt, &table_options, nullptr, "k05", + {"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"}); + // invalid + PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {}, + Status::InvalidArgument(Slice("k06 "), Slice("k07"))); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + for (int i = 0; i < 4; ++i) { + Options options; + // Make each key/value an individual block + table_options.block_size = 64; + switch (i) { + case 0: + // Binary search index + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 1: + // Hash search index + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + 
options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 2: + // Hash search index with hash_index_allow_collision + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.hash_index_allow_collision = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 3: + // Hash search index with filter policy + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 4: + default: + // Binary search index + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + } + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("aaaa1", std::string('a', 56)); + c.Add("bbaa1", std::string('a', 56)); + c.Add("cccc1", std::string('a', 56)); + c.Add("bbbb1", std::string('a', 56)); + c.Add("baaa1", std::string('a', 56)); + c.Add("abbb1", std::string('a', 56)); + c.Add("cccc2", std::string('a', 56)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + auto props = c.GetTableReader()->GetTableProperties(); + ASSERT_EQ(7u, props->num_data_blocks); + auto* reader = c.GetTableReader(); + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); + + iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bb", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + + iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString()); + } +} + +TEST_P(BlockBasedTableTest, NoopTransformSeek) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewNoopTransform()); + + TableConstructor c(options.comparator); + // To tickle the PrefixMayMatch bug it is important that the + // user-key is a single byte so that the index key exactly matches + // the user-key. 
+ InternalKey key("a", 1, kTypeValue); + c.Add(key.Encode().ToString(), "b"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto* reader = c.GetTableReader(); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + ro.total_order_seek = (i == 0); + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); + + iter->Seek(key.Encode()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString()); + } +} + +TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { + // if DB is opened with a prefix extractor of a different name, + // prefix bloom is skipped when read the file + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy(2)); + table_options.whole_key_filtering = false; + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + TableConstructor c(options.comparator); + InternalKey key("abcdefghijk", 1, kTypeValue); + c.Add(key.Encode().ToString(), "test"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + // TODO(Zhongyi): update test to use MutableCFOptions + options.prefix_extractor.reset(NewFixedPrefixTransform(9)); + const ImmutableCFOptions new_ioptions(options); + const MutableCFOptions new_moptions(options); + c.Reopen(new_ioptions, new_moptions); + auto reader = c.GetTableReader(); + std::unique_ptr<InternalIterator> db_iter( + reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); + + // Test point lookup + // only one kv + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } +} + +static std::string RandomString(Random* rnd, int len) { + std::string r; + test::RandomString(rnd, len, &r); + return r; +} + +void AddInternalKey(TableConstructor* c, const std::string& prefix, + int /*suffix_len*/ = 800) { + static Random rnd(1023); + InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); + c->Add(k.Encode().ToString(), "v"); +} + +void TableTest::IndexTest(BlockBasedTableOptions table_options) { + TableConstructor c(BytewiseComparator()); + + // keys with prefix length 3, make sure the key/value is big enough to fill + // one block + AddInternalKey(&c, "0015"); + AddInternalKey(&c, "0035"); + + AddInternalKey(&c, "0054"); + AddInternalKey(&c, "0055"); + + AddInternalKey(&c, "0056"); + AddInternalKey(&c, "0057"); + + AddInternalKey(&c, "0058"); + AddInternalKey(&c, "0075"); + + AddInternalKey(&c, "0076"); + AddInternalKey(&c, "0095"); + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + table_options.block_size = 1700; + table_options.block_cache = NewLRUCache(1024, 4); + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + auto reader = c.GetTableReader(); + + auto props = reader->GetTableProperties(); + ASSERT_EQ(5u, props->num_data_blocks); + + // TODO(Zhongyi): update test to use MutableCFOptions + std::unique_ptr<InternalIterator> index_iter( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + + // -- Find keys do not exist, but have common prefix. + std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"}; + std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2], + keys[7], keys[9], }; + + // find the lower bound of the prefix + for (size_t i = 0; i < prefixes.size(); ++i) { + index_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode()); + ASSERT_OK(index_iter->status()); + ASSERT_TRUE(index_iter->Valid()); + + // seek the first element in the block + ASSERT_EQ(lower_bound[i], index_iter->key().ToString()); + ASSERT_EQ("v", index_iter->value().ToString()); + } + + // find the upper bound of prefixes + std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7], keys[9], }; + + // find existing keys + for (const auto& item : kvmap) { + auto ukey = ExtractUserKey(item.first).ToString(); + index_iter->Seek(ukey); + + // ASSERT_OK(regular_iter->status()); + ASSERT_OK(index_iter->status()); + + // ASSERT_TRUE(regular_iter->Valid()); + ASSERT_TRUE(index_iter->Valid()); + + ASSERT_EQ(item.first, index_iter->key().ToString()); + ASSERT_EQ(item.second, index_iter->value().ToString()); + } + + for (size_t i = 0; i < prefixes.size(); ++i) { + // the key is greater than any existing keys. + auto key = prefixes[i] + "9"; + index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); + + ASSERT_OK(index_iter->status()); + if (i == prefixes.size() - 1) { + // last key + ASSERT_TRUE(!index_iter->Valid()); + } else { + ASSERT_TRUE(index_iter->Valid()); + // seek the first element in the block + ASSERT_EQ(upper_bound[i], index_iter->key().ToString()); + ASSERT_EQ("v", index_iter->value().ToString()); + } + } + + // find keys with prefix that don't match any of the existing prefixes. + std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"}; + for (const auto& prefix : non_exist_prefixes) { + index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(index_iter->status()); + // Seek to non-existing prefixes should yield either invalid, or a + // key with prefix greater than the target. 
+    if (index_iter->Valid()) {
+      Slice ukey = ExtractUserKey(index_iter->key());
+      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+      ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
+    }
+  }
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexTest) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, HashIndexTest) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, PartitionIndexTest) {
+  const int max_index_keys = 5;
+  const int est_max_index_key_value_size = 32;
+  const int est_max_index_size = max_index_keys * est_max_index_key_value_size;
+  for (int i = 1; i <= est_max_index_size + 1; i++) {
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+    table_options.metadata_block_size = i;
+    IndexTest(table_options);
+  }
+}
+
+// It's very hard to figure out the index block size of a table accurately.
+// To make sure we track the index size, we just make sure that as the number
+// of keys grows, the index block size also grows.
+TEST_P(BlockBasedTableTest, IndexSizeStat) {
+  uint64_t last_index_size = 0;
+
+  // We need to use random keys, since purely human-readable text may compress
+  // well, resulting in an insignificant change of index block size.
+  Random rnd(test::RandomSeed());
+  std::vector<std::string> keys;
+
+  for (int i = 0; i < 100; ++i) {
+    keys.push_back(RandomString(&rnd, 10000));
+  }
+
+  // Each time we load one more key into the table, the table's index block
+  // size is expected to be larger than last time's.
+  for (size_t i = 1; i < keys.size(); ++i) {
+    TableConstructor c(BytewiseComparator(),
+                       true /* convert_to_internal_key_ */);
+    for (size_t j = 0; j < i; ++j) {
+      c.Add(keys[j], "val");
+    }
+
+    std::vector<std::string> ks;
+    stl_wrappers::KVMap kvmap;
+    Options options;
+    options.compression = kNoCompression;
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    table_options.block_restart_interval = 1;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    const ImmutableCFOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+    c.Finish(options, ioptions, moptions, table_options,
+             GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+    auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
+    ASSERT_GT(index_size, last_index_size);
+    last_index_size = index_size;
+    c.ResetTableReader();
+  }
+}
+
+TEST_P(BlockBasedTableTest, NumBlockStat) {
+  Random rnd(test::RandomSeed());
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  Options options;
+  options.compression = kNoCompression;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_restart_interval = 1;
+  table_options.block_size = 1000;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  for (int i = 0; i < 10; ++i) {
+    // The key/value pairs are slightly smaller than the block size, so that
+    // each block holds roughly one key/value pair.
+ c.Add(RandomString(&rnd, 900), "val"); + } + + std::vector<std::string> ks; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &ks, &kvmap); + ASSERT_EQ(kvmap.size(), + c.GetTableReader()->GetTableProperties()->num_data_blocks); + c.ResetTableReader(); +} + +// A simple tool that takes the snapshot of block cache statistics. +class BlockCachePropertiesSnapshot { + public: + explicit BlockCachePropertiesSnapshot(Statistics* statistics) { + block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + filter_block_cache_miss = + statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS); + filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); + block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ); + block_cache_bytes_write = + statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE); + } + + void AssertIndexBlockStat(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit) { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + } + + void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss, + int64_t expected_filter_block_cache_hit) { + ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss); + ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit); + } + + // Check if the fetched props matches the expected ones. + // TODO(kailiu) Use this only when you disabled filter policy! + void AssertEqual(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit, + int64_t expected_data_block_cache_miss, + int64_t expected_data_block_cache_hit) const { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss); + ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit); + ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss, + block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit, + block_cache_hit); + } + + int64_t GetCacheBytesRead() { return block_cache_bytes_read; } + + int64_t GetCacheBytesWrite() { return block_cache_bytes_write; } + + private: + int64_t block_cache_miss = 0; + int64_t block_cache_hit = 0; + int64_t index_block_cache_miss = 0; + int64_t index_block_cache_hit = 0; + int64_t data_block_cache_miss = 0; + int64_t data_block_cache_hit = 0; + int64_t filter_block_cache_miss = 0; + int64_t filter_block_cache_hit = 0; + int64_t block_cache_bytes_read = 0; + int64_t block_cache_bytes_write = 0; +}; + +// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't +// use block cache to store them). 
+TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_cache = NewLRUCache(1024, 4);
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("key", "value");
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  // Preloading filter/index blocks is enabled.
+  auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  ASSERT_TRUE(reader->TEST_filter_block_preloaded());
+  ASSERT_TRUE(reader->TEST_index_reader_preloaded());
+
+  {
+    // Nothing happens in the beginning.
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+
+  {
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, Slice(), nullptr, nullptr,
+                           nullptr, nullptr, nullptr);
+    // A hack just to trigger BlockBasedTable::GetFilter.
+    reader->Get(ReadOptions(), "non-exist-key", &get_context,
+                moptions.prefix_extractor.get());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+}
+
+// Due to the difficulty of the interaction between statistics, this test
+// only covers the case where the index block is put into the block cache.
+TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
+  // -- Table construction
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+
+  // Enable the cache for index/filter blocks
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_cache = NewLRUCache(2048, 2);
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("key", "value");
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  // Preloading filter/index blocks is prohibited.
+  auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
+  ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
+
+  // -- PART 1: Open with a regular block cache.
+  // With cache_index_and_filter_blocks enabled, index and filter blocks go
+  // through the block cache instead of being preloaded by the reader.
+  std::unique_ptr<InternalIterator> iter;
+
+  int64_t last_cache_bytes_read = 0;
+  // At first, no block will be accessed.
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // The index block will have been added to the block cache.
+ props.AssertEqual(1, // index block miss + 0, 0, 0); + ASSERT_EQ(props.GetCacheBytesRead(), 0); + ASSERT_EQ(props.GetCacheBytesWrite(), + table_options.block_cache->GetUsage()); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Only index block will be accessed + { + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); + BlockCachePropertiesSnapshot props(options.statistics.get()); + // NOTE: to help better highlight the "detla" of each ticker, I use + // <last_value> + <added_value> to indicate the increment of changed + // value; other numbers remain the same. + props.AssertEqual(1, 0 + 1, // index block hit + 0, 0); + // Cache hit, bytes read from cache should increase + ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + table_options.block_cache->GetUsage()); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Only data block will be accessed + { + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1, 0 + 1, // data block miss + 0); + // Cache miss, Bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + table_options.block_cache->GetUsage()); + last_cache_bytes_read = props.GetCacheBytesRead(); + } + + // Data block will be in cache + { + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, 1 + 1, /* index block hit */ + 1, 0 + 1 /* data block hit */); + // Cache hit, bytes read from cache should increase + ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); + ASSERT_EQ(props.GetCacheBytesWrite(), + table_options.block_cache->GetUsage()); + } + // release the iterator so that the block cache can reset correctly. + iter.reset(); + + c.ResetTableReader(); + + // -- PART 2: Open with very small block cache + // In this test, no block will ever get hit since the block cache is + // too small to fit even one entry. + table_options.block_cache = NewLRUCache(1, 4); + options.statistics = CreateDBStatistics(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions2(options); + const MutableCFOptions moptions2(options); + c.Reopen(ioptions2, moptions2); + { + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1, // index block miss + 0, 0, 0); + // Cache miss, Bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + + { + // Both index and data block get accessed. + // It first cache index block then data block. But since the cache size + // is only 1, index block will be purged after data block is inserted. + iter.reset(c.NewIterator(moptions2.prefix_extractor.get())); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(1 + 1, // index block miss + 0, 0, // data block miss + 0); + // Cache hit, bytes read from cache should increase + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + + { + // SeekToFirst() accesses data block. With similar reason, we expect data + // block's cache miss. 
+ iter->SeekToFirst(); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertEqual(2, 0, 0 + 1, // data block miss + 0); + // Cache miss, Bytes read from cache should not change + ASSERT_EQ(props.GetCacheBytesRead(), 0); + } + iter.reset(); + c.ResetTableReader(); + + // -- PART 3: Open table with bloom filter enabled but not in SST file + table_options.block_cache = NewLRUCache(4096, 4); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c3(BytewiseComparator()); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + c3.Add(internal_key.Encode().ToString(), "hello"); + ImmutableCFOptions ioptions3(options); + MutableCFOptions moptions3(options); + // Generate table without filter policy + c3.Finish(options, ioptions3, moptions3, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c3.ResetTableReader(); + + // Open table with filter policy + table_options.filter_policy.reset(NewBloomFilterPolicy(1)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + ImmutableCFOptions ioptions4(options); + MutableCFOptions moptions4(options); + ASSERT_OK(c3.Reopen(ioptions4, moptions4)); + reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader()); + ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, + moptions4.prefix_extractor.get())); + ASSERT_STREQ(value.data(), "hello"); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertFilterBlockStat(0, 0); + c3.ResetTableReader(); +} + +void ValidateBlockSizeDeviation(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_size_deviation = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + (const BlockBasedTableOptions*)factory->GetOptions(); + ASSERT_EQ(normalized_table_options->block_size_deviation, expected); + + delete factory; +} + +void ValidateBlockRestartInterval(int value, int expected) { + BlockBasedTableOptions table_options; + table_options.block_restart_interval = value; + BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); + + const BlockBasedTableOptions* normalized_table_options = + (const BlockBasedTableOptions*)factory->GetOptions(); + ASSERT_EQ(normalized_table_options->block_restart_interval, expected); + + delete factory; +} + +TEST_P(BlockBasedTableTest, InvalidOptions) { + // invalid values for block_size_deviation (<0 or >100) are silently set to 0 + ValidateBlockSizeDeviation(-10, 0); + ValidateBlockSizeDeviation(-1, 0); + ValidateBlockSizeDeviation(0, 0); + ValidateBlockSizeDeviation(1, 1); + ValidateBlockSizeDeviation(99, 99); + ValidateBlockSizeDeviation(100, 100); + ValidateBlockSizeDeviation(101, 0); + ValidateBlockSizeDeviation(1000, 0); + + // invalid values for block_restart_interval (<1) are silently set to 1 + ValidateBlockRestartInterval(-10, 1); + ValidateBlockRestartInterval(-1, 1); + ValidateBlockRestartInterval(0, 1); + ValidateBlockRestartInterval(1, 1); + ValidateBlockRestartInterval(2, 2); + ValidateBlockRestartInterval(1000, 1000); +} + 
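+// BlockReadCountTest verifies, via the perf context's block_read_count, how
+// many blocks are physically read for a point lookup, depending on the filter
+// type (block-based vs. full) and on whether index/filter blocks are served
+// through the block cache.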
+TEST_P(BlockBasedTableTest, BlockReadCountTest) {
+  // bloom_filter_type = 0 -- block-based filter
+  // bloom_filter_type = 1 -- full filter
+  for (int bloom_filter_type = 0; bloom_filter_type < 2; ++bloom_filter_type) {
+    for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
+         ++index_and_filter_in_cache) {
+      Options options;
+      options.create_if_missing = true;
+
+      BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+      table_options.block_cache = NewLRUCache(1, 0);
+      table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, bloom_filter_type == 0));
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      std::vector<std::string> keys;
+      stl_wrappers::KVMap kvmap;
+
+      TableConstructor c(BytewiseComparator());
+      std::string user_key = "k04";
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      c.Add(encoded_key, "hello");
+      ImmutableCFOptions ioptions(options);
+      MutableCFOptions moptions(options);
+      // Generate table with filter policy
+      c.Finish(options, ioptions, moptions, table_options,
+               GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+      auto reader = c.GetTableReader();
+      PinnableSlice value;
+      GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, user_key, &value, nullptr,
+                             nullptr, nullptr, nullptr);
+      get_perf_context()->Reset();
+      ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+                            moptions.prefix_extractor.get()));
+      if (index_and_filter_in_cache) {
+        // data, index and filter block
+        ASSERT_EQ(get_perf_context()->block_read_count, 3);
+      } else {
+        // just the data block
+        ASSERT_EQ(get_perf_context()->block_read_count, 1);
+      }
+      ASSERT_EQ(get_context.State(), GetContext::kFound);
+      ASSERT_STREQ(value.data(), "hello");
+
+      // Get non-existing key
+      user_key = "does-not-exist";
+      internal_key = InternalKey(user_key, 0, kTypeValue);
+      encoded_key = internal_key.Encode().ToString();
+
+      value.Reset();
+      get_context = GetContext(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, nullptr);
+      get_perf_context()->Reset();
+      ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+                            moptions.prefix_extractor.get()));
+      ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+
+      if (index_and_filter_in_cache) {
+        if (bloom_filter_type == 0) {
+          // with a block-based filter, we read the index and then the filter
+          ASSERT_EQ(get_perf_context()->block_read_count, 2);
+        } else {
+          // with a full filter, we read the filter first and then we stop
+          ASSERT_EQ(get_perf_context()->block_read_count, 1);
+        }
+      } else {
+        // the filter is already in memory and it figures out that the key
+        // doesn't exist
+        ASSERT_EQ(get_perf_context()->block_read_count, 0);
+      }
+    }
+  }
+}
+
+// A wrapper around LRUCache that also keeps track of data blocks (as opposed
+// to general objects) in the cache. The class is very simple and can be used
+// only for trivial tests.
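+// Insert() is intercepted to remember each entry's original deleter, and
+// TEST_mark_as_data_block() records the charge of every key that the table
+// marks as a data block; MockDeleter subtracts that charge again on eviction,
+// so marked_size_ always reflects the bytes of data blocks currently cached.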
+class MockCache : public LRUCache {
+ public:
+  MockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+            double high_pri_pool_ratio)
+      : LRUCache(capacity, num_shard_bits, strict_capacity_limit,
+                 high_pri_pool_ratio) {}
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    // Replace the deleter with our own so that we keep track of data blocks
+    // erased from the cache
+    deleters_[key.ToString()] = deleter;
+    return ShardedCache::Insert(key, value, charge, &MockDeleter, handle,
+                                priority);
+  }
+  // This is called by the application right after inserting a data block
+  void TEST_mark_as_data_block(const Slice& key, size_t charge) override {
+    marked_data_in_cache_[key.ToString()] = charge;
+    marked_size_ += charge;
+  }
+  using DeleterFunc = void (*)(const Slice& key, void* value);
+  static std::map<std::string, DeleterFunc> deleters_;
+  static std::map<std::string, size_t> marked_data_in_cache_;
+  static size_t marked_size_;
+  static void MockDeleter(const Slice& key, void* value) {
+    // If the item was marked as a data block, subtract its charge from the
+    // total data block usage of the cache
+    if (marked_data_in_cache_.find(key.ToString()) !=
+        marked_data_in_cache_.end()) {
+      marked_size_ -= marked_data_in_cache_[key.ToString()];
+    }
+    // Then call the original deleter
+    assert(deleters_.find(key.ToString()) != deleters_.end());
+    auto deleter = deleters_[key.ToString()];
+    deleter(key, value);
+  }
+};
+
+size_t MockCache::marked_size_ = 0;
+std::map<std::string, MockCache::DeleterFunc> MockCache::deleters_;
+std::map<std::string, size_t> MockCache::marked_data_in_cache_;
+
+// The block cache can contain raw data blocks as well as general objects. If
+// an object depends on the table being live, it must be destructed before the
+// table is closed. This test makes sure that the only items remaining in the
+// cache after the table is closed are raw data blocks.
+TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) {
+  std::vector<CompressionType> compression_types{kNoCompression};
+
+  // The following are the compression library versions supporting compression
+  // dictionaries. See the test case CacheCompressionDict in the
+  // DBBlockCacheTest suite.
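+  // Only the types that are compiled in (and, for LZ4/ZSTD, recent enough to
+  // support dictionaries) are appended below; kNoCompression is always
+  // exercised.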
+#ifdef ZLIB + compression_types.push_back(kZlibCompression); +#endif // ZLIB +#if LZ4_VERSION_NUMBER >= 10400 + compression_types.push_back(kLZ4Compression); + compression_types.push_back(kLZ4HCCompression); +#endif // LZ4_VERSION_NUMBER >= 10400 +#if ZSTD_VERSION_NUMBER >= 500 + compression_types.push_back(kZSTD); +#endif // ZSTD_VERSION_NUMBER >= 500 + + for (int level: {-1, 0, 1, 10}) { + for (auto index_type : + {BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { + for (bool block_based_filter : {true, false}) { + for (bool partition_filter : {true, false}) { + if (partition_filter && + (block_based_filter || + index_type != + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { + continue; + } + for (bool index_and_filter_in_cache : {true, false}) { + for (bool pin_l0 : {true, false}) { + for (bool pin_top_level : {true, false}) { + if (pin_l0 && !index_and_filter_in_cache) { + continue; + } + + for (auto compression_type : compression_types) { + for (uint32_t max_dict_bytes : {0, 1 << 14}) { + if (compression_type == kNoCompression && max_dict_bytes) + continue; + + // Create a table + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator( + opt.comparator)); + opt.compression = compression_type; + opt.compression_opts.max_dict_bytes = max_dict_bytes; + BlockBasedTableOptions table_options = + GetBlockBasedTableOptions(); + table_options.block_size = 1024; + table_options.index_type = index_type; + table_options.pin_l0_filter_and_index_blocks_in_cache = + pin_l0; + table_options.pin_top_level_index_and_filter = + pin_top_level; + table_options.partition_filters = partition_filter; + table_options.cache_index_and_filter_blocks = + index_and_filter_in_cache; + // big enough so we don't ever lose cached values. 
+ table_options.block_cache = std::make_shared<MockCache>( + 16 * 1024 * 1024, 4, false, 0.0); + table_options.filter_policy.reset( + rocksdb::NewBloomFilterPolicy(10, block_based_filter)); + opt.table_factory.reset(NewBlockBasedTableFactory( + table_options)); + + bool convert_to_internal_key = false; + TableConstructor c(BytewiseComparator(), + convert_to_internal_key, level); + std::string user_key = "k01"; + std::string key = + InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + c.Add(key, "hello"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, + &keys, &kvmap); + + // Doing a read to make index/filter loaded into the cache + auto table_reader = + dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + PinnableSlice value; + GetContext get_context(opt.comparator, nullptr, nullptr, + nullptr, GetContext::kNotFound, user_key, &value, + nullptr, nullptr, nullptr, nullptr); + InternalKey ikey(user_key, 0, kTypeValue); + auto s = table_reader->Get(ReadOptions(), key, &get_context, + moptions.prefix_extractor.get()); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); + + // Close the table + c.ResetTableReader(); + + auto usage = table_options.block_cache->GetUsage(); + auto pinned_usage = + table_options.block_cache->GetPinnedUsage(); + // The only usage must be for marked data blocks + ASSERT_EQ(usage, MockCache::marked_size_); + // There must be some pinned data since PinnableSlice has + // not released them yet + ASSERT_GT(pinned_usage, 0); + // Release pinnable slice reousrces + value.Reset(); + pinned_usage = table_options.block_cache->GetPinnedUsage(); + ASSERT_EQ(pinned_usage, 0); + } + } + } + } + } + } + } + } + } // level +} + +TEST_P(BlockBasedTableTest, BlockCacheLeak) { + // Check that when we reopen a table we don't lose access to blocks already + // in the cache. This test checks whether the Table actually makes use of the + // unique ID from the file. + + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_size = 1024; + // big enough so we don't ever lose cached values. 
+ table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr<InternalIterator> iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + + const ImmutableCFOptions ioptions1(opt); + const MutableCFOptions moptions1(opt); + ASSERT_OK(c.Reopen(ioptions1, moptions1)); + auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + for (const std::string& key : keys) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + c.ResetTableReader(); + + // rerun with different block cache + table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions2(opt); + const MutableCFOptions moptions2(opt); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); + table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + for (const std::string& key : keys) { + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); + } + c.ResetTableReader(); +} + +namespace { +class CustomMemoryAllocator : public MemoryAllocator { + public: + const char* Name() const override { return "CustomMemoryAllocator"; } + + void* Allocate(size_t size) override { + ++numAllocations; + auto ptr = new char[size + 16]; + memcpy(ptr, "memory_allocator_", 16); // mangle first 16 bytes + return reinterpret_cast<void*>(ptr + 16); + } + void Deallocate(void* p) override { + ++numDeallocations; + char* ptr = reinterpret_cast<char*>(p) - 16; + delete[] ptr; + } + + std::atomic<int> numAllocations; + std::atomic<int> numDeallocations; +}; +} // namespace + +TEST_P(BlockBasedTableTest, MemoryAllocator) { + auto custom_memory_allocator = std::make_shared<CustomMemoryAllocator>(); + { + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + LRUCacheOptions lruOptions; + lruOptions.memory_allocator = custom_memory_allocator; + lruOptions.capacity = 16 * 1024 * 1024; + lruOptions.num_shard_bits = 4; + table_options.block_cache = NewLRUCache(std::move(lruOptions)); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const 
ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr<InternalIterator> iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + } + + // out of scope, block cache should have been deleted, all allocations + // deallocated + EXPECT_EQ(custom_memory_allocator->numAllocations.load(), + custom_memory_allocator->numDeallocations.load()); + // make sure that allocations actually happened through the cache allocator + EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); +} + +TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { + // A regression test to avoid data race described in + // https://github.com/facebook/rocksdb/issues/1267 + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Add("a1", "val1"); + Options options; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(0); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( + { + {"BlockBasedTable::NewIndexIterator::thread1:1", + "BlockBasedTable::NewIndexIterator::thread2:2"}, + {"BlockBasedTable::NewIndexIterator::thread2:3", + "BlockBasedTable::NewIndexIterator::thread1:4"}, + }, + { + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", + "BlockBasedTable::NewIndexIterator::thread1:1"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", + "BlockBasedTable::NewIndexIterator::thread1:4"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", + "BlockBasedTable::NewIndexIterator::thread2:2"}, + {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", + "BlockBasedTable::NewIndexIterator::thread2:3"}, + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ReadOptions ro; + auto* reader = c.GetTableReader(); + + std::function<void()> func1 = [&]() { + TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); + // TODO(Zhongyi): update test to use MutableCFOptions + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); + iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); + }; + + std::function<void()> func2 = [&]() { + TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); + }; + + auto thread1 = port::Thread(func1); + auto thread2 = port::Thread(func2); + thread1.join(); + thread2.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + c.ResetTableReader(); +} + +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE +TEST_F(PlainTableTest, BasicPlainTableProperties) { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 8; + plain_table_options.bloom_bits_per_key = 8; + 
plain_table_options.hash_table_ratio = 0; + + PlainTableFactory factory(plain_table_options); + test::StringSink sink; + std::unique_ptr<WritableFileWriter> file_writer( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + Options options; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + int unknown_level = -1; + std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder( + TableBuilderOptions( + ioptions, moptions, ikc, &int_tbl_prop_collector_factories, + kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, unknown_level), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + key.append("\1 "); // PlainTable expects internal key structure + std::string value(28, c + 42); + builder->Add(key, value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::StringSink* ss = + static_cast<test::StringSink*>(file_writer->writable_file()); + std::unique_ptr<RandomAccessFileReader> file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss->contents(), 72242, true))); + + TableProperties* props = nullptr; + auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), + kPlainTableMagicNumber, ioptions, + &props, true /* compression_type_missing */); + std::unique_ptr<TableProperties> props_guard(props); + ASSERT_OK(s); + + ASSERT_EQ(0ul, props->index_size); + ASSERT_EQ(0ul, props->filter_size); + ASSERT_EQ(16ul * 26, props->raw_key_size); + ASSERT_EQ(28ul * 26, props->raw_value_size); + ASSERT_EQ(26ul, props->num_entries); + ASSERT_EQ(1ul, props->num_data_blocks); +} +#endif // !ROCKSDB_LITE + +TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + test::PlainInternalKeyComparator internal_comparator(options.comparator); + options.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + // k04 and k05 will be in two consecutive blocks, the index is + // an arbitrary slice between k04 and k05, either before or after k04a + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); + c.ResetTableReader(); +} + +static void DoCompressionTest(CompressionType comp) { + Random rnd(301); + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + std::string tmp; + c.Add("k01", "hello"); + c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + c.Add("k03", "hello3"); + c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp)); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + test::PlainInternalKeyComparator ikc(options.comparator); + options.compression = comp; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); + + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500)); + c.ResetTableReader(); +} + +TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { + std::vector<CompressionType> compression_state; + if (!Snappy_Supported()) { + fprintf(stderr, "skipping snappy compression tests\n"); + } else { + compression_state.push_back(kSnappyCompression); + } + + if (!Zlib_Supported()) { + fprintf(stderr, "skipping zlib compression tests\n"); + } else { + compression_state.push_back(kZlibCompression); + } + + // TODO(kailiu) DoCompressionTest() doesn't work with BZip2. + /* + if (!BZip2_Supported()) { + fprintf(stderr, "skipping bzip2 compression tests\n"); + } else { + compression_state.push_back(kBZip2Compression); + } + */ + + if (!LZ4_Supported()) { + fprintf(stderr, "skipping lz4 and lz4hc compression tests\n"); + } else { + compression_state.push_back(kLZ4Compression); + compression_state.push_back(kLZ4HCCompression); + } + + if (!XPRESS_Supported()) { + fprintf(stderr, "skipping xpress and xpress compression tests\n"); + } + else { + compression_state.push_back(kXpressCompression); + } + + for (auto state : compression_state) { + DoCompressionTest(state); + } +} + +#ifndef ROCKSDB_VALGRIND_RUN +// RandomizedHarnessTest is very slow for certain combination of arguments +// Split into 8 pieces to reduce the time individual tests take. 
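+// Each Randomized<N> test below runs part N of that eight-way split by
+// calling RandomizedHarnessTest(part, total).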
+TEST_F(HarnessTest, Randomized1) { + // part 1 out of 8 + const size_t part = 1; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized2) { + // part 2 out of 8 + const size_t part = 2; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized3) { + // part 3 out of 8 + const size_t part = 3; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized4) { + // part 4 out of 8 + const size_t part = 4; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized5) { + // part 5 out of 8 + const size_t part = 5; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized6) { + // part 6 out of 8 + const size_t part = 6; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized7) { + // part 7 out of 8 + const size_t part = 7; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized8) { + // part 8 out of 8 + const size_t part = 8; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +#ifndef ROCKSDB_LITE +TEST_F(HarnessTest, RandomizedLongDB) { + Random rnd(test::RandomSeed()); + TestArgs args = {DB_TEST, false, 16, kNoCompression, 0, false}; + Init(args); + int num_entries = 100000; + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + + // We must have created enough data to force merging + int files = 0; + for (int level = 0; level < db()->NumberLevels(); level++) { + std::string value; + char name[100]; + snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level); + ASSERT_TRUE(db()->GetProperty(name, &value)); + files += atoi(value.c_str()); + } + ASSERT_GT(files, 0); +} +#endif // ROCKSDB_LITE +#endif // ROCKSDB_VALGRIND_RUN + +class MemTableTest : public testing::Test {}; + +TEST_F(MemTableTest, Simple) { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared<SkipListFactory>(); + Options options; + options.memtable_factory = table_factory; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* memtable = + new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable->Ref(); + WriteBatch batch; + WriteBatchInternal::SetSequence(&batch, 100); + batch.Put(std::string("k1"), std::string("v1")); + batch.Put(std::string("k2"), std::string("v2")); + batch.Put(std::string("k3"), std::string("v3")); + batch.Put(std::string("largekey"), std::string("vlarge")); + batch.DeleteRange(std::string("chi"), std::string("xigua")); + batch.DeleteRange(std::string("begin"), std::string("end")); + ColumnFamilyMemTablesDefault cf_mems_default(memtable); + ASSERT_TRUE( + WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr).ok()); + + for (int i = 0; i < 2; ++i) { + Arena arena; + ScopedArenaIterator arena_iter_guard; + std::unique_ptr<InternalIterator> iter_guard; + InternalIterator* iter; + if (i == 0) { + iter = memtable->NewIterator(ReadOptions(), &arena); + arena_iter_guard.set(iter); + } else { + iter = memtable->NewRangeTombstoneIterator( + ReadOptions(), kMaxSequenceNumber /* read_seq */); + iter_guard.reset(iter); + } + if (iter == nullptr) { + continue; + } + iter->SeekToFirst(); + while 
(iter->Valid()) { + fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(), + iter->value().ToString().c_str()); + iter->Next(); + } + } + + delete memtable->Unref(); +} + +// Test the empty key +TEST_F(HarnessTest, SimpleEmptyKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); + } +} + +TEST_F(HarnessTest, SimpleSingle) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); + } +} + +TEST_F(HarnessTest, SimpleMulti) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); + } +} + +TEST_F(HarnessTest, SimpleSpecialKey) { + auto args = GenerateArgList(); + for (const auto& arg : args) { + Init(arg); + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); + } +} + +TEST_F(HarnessTest, FooterTests) { + { + // upconvert legacy block based + std::string encoded; + Footer footer(kLegacyBlockBasedTableMagicNumber, 0); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); + } + { + // xxhash block based + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 1); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.set_checksum(kxxHash); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kxxHash); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } + { + // xxhash64 block based + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 1); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.set_checksum(kxxHash64); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kxxHash64); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), 
index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } +// Plain table is not supported in ROCKSDB_LITE +#ifndef ROCKSDB_LITE + { + // upconvert legacy plain table + std::string encoded; + Footer footer(kLegacyPlainTableMagicNumber, 0); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); + } + { + // xxhash block based + std::string encoded; + Footer footer(kPlainTableMagicNumber, 1); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.set_checksum(kxxHash); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kxxHash); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } +#endif // !ROCKSDB_LITE + { + // version == 2 + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 2); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 2U); + } +} + +class IndexBlockRestartIntervalTest + : public TableTest, + public ::testing::WithParamInterface<std::pair<int, bool>> { + public: + static std::vector<std::pair<int, bool>> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } +}; + +INSTANTIATE_TEST_CASE_P( + IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest, + ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues())); + +TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { + const int kKeysInTable = 10000; + const int kKeySize = 100; + const int kValSize = 500; + + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); + + Options options; + BlockBasedTableOptions table_options; + table_options.block_size = 64; // 
small block size to get big index block + table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + static Random rnd(301); + for (int i = 0; i < kKeysInTable; i++) { + InternalKey k(RandomString(&rnd, kKeySize), 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + std::unique_ptr<InternalKeyComparator> comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + auto reader = c.GetTableReader(); + + std::unique_ptr<InternalIterator> db_iter( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + + // Test point lookup + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } + + // Test iterating + auto kv_iter = kvmap.begin(); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + ASSERT_EQ(db_iter->key(), kv_iter->first); + ASSERT_EQ(db_iter->value(), kv_iter->second); + kv_iter++; + } + ASSERT_EQ(kv_iter, kvmap.end()); + c.ResetTableReader(); +} + +class PrefixTest : public testing::Test { + public: + PrefixTest() : testing::Test() {} + ~PrefixTest() override {} +}; + +namespace { +// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest +class TestPrefixExtractor : public rocksdb::SliceTransform { + public: + ~TestPrefixExtractor() override{}; + const char* Name() const override { return "TestPrefixExtractor"; } + + rocksdb::Slice Transform(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return rocksdb::Slice(src.data(), 3); + } + + bool InDomain(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return true; + } + + bool InRange(const rocksdb::Slice& /*dst*/) const override { return true; } + + bool IsValid(const rocksdb::Slice& src) const { + if (src.size() != 4) { + return false; + } + if (src[0] != '[') { + return false; + } + if (src[1] < '0' || src[1] > '9') { + return false; + } + if (src[2] != ']') { + return false; + } + if (src[3] < '0' || src[3] > '9') { + return false; + } + return true; + } +}; +} // namespace + +TEST_F(PrefixTest, PrefixAndWholeKeyTest) { + rocksdb::Options options; + options.compaction_style = rocksdb::kCompactionStyleUniversal; + options.num_levels = 20; + options.create_if_missing = true; + options.optimize_filters_for_hits = false; + options.target_file_size_base = 268435456; + options.prefix_extractor = std::make_shared<TestPrefixExtractor>(); + rocksdb::BlockBasedTableOptions bbto; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10)); + bbto.block_size = 262144; + bbto.whole_key_filtering = true; + + const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db)); + + // Create a bunch of keys with 10 filters. 
+ for (int i = 0; i < 10; i++) { + std::string prefix = "[" + std::to_string(i) + "]"; + for (int j = 0; j < 10; j++) { + std::string key = prefix + std::to_string(j); + db->Put(rocksdb::WriteOptions(), key, "1"); + } + } + + // Trigger compaction. + db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + delete db; + // In the second round, turn whole_key_filtering off and expect + // rocksdb still works. +} + +/* + * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in + * the SST file any more. Instead, RocksDB deduces global_seqno from the + * MANIFEST while reading from an SST. Therefore, it's not possible to test the + * functionality of global_seqno in a single, isolated unit test without the + * involvement of Version, VersionSet, etc. + */ +TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<WritableFileWriter> file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + std::string column_family_name; + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (char c = 'a'; c <= 'z'; ++c) { + std::string key(8, c); + std::string value = key; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + uint32_t version; + uint64_t global_seqno; + uint64_t global_seqno_offset; + + // Helper function to get version, global_seqno, global_seqno_offset + std::function<void()> GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr<RandomAccessFileReader> file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + TableProperties* props = nullptr; + ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props, true /* compression_type_missing */)); + + UserCollectedProperties user_props = props->user_collected_properties; + version = DecodeFixed32( + user_props[ExternalSstFilePropertyNames::kVersion].c_str()); + global_seqno = DecodeFixed64( + user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str()); + global_seqno_offset = + props->properties_offsets[ExternalSstFilePropertyNames::kGlobalSeqno]; + + delete props; + }; + + // Helper function to update the value of the global seqno in the file + std::function<void(uint64_t)> SetGlobalSeqno = [&](uint64_t val) { + std::string new_global_seqno; + PutFixed64(&new_global_seqno, val); + + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno)); + }; + + // Helper function to get the contents of the table InternalIterator + std::unique_ptr<TableReader> table_reader; 
+ std::function<InternalIterator*()> GetTableInternalIter = [&]() { + std::unique_ptr<RandomAccessFileReader> file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + options.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), + EnvOptions(), ikc), + std::move(file_reader), ss_rw.contents().size(), &table_reader); + + return table_reader->NewIterator(ReadOptions(), + moptions.prefix_extractor.get()); + }; + + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2, version); + ASSERT_EQ(0, global_seqno); + + InternalIterator* iter = GetTableInternalIter(); + char current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 0); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + delete iter; + + // Update global sequence number to 10 + SetGlobalSeqno(10); + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2, version); + ASSERT_EQ(10, global_seqno); + + iter = GetTableInternalIter(); + current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 10); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + + // Verify Seek + for (char c = 'a'; c <= 'z'; c++) { + std::string k = std::string(8, c); + InternalKey ik(k, 10, kValueTypeForSeek); + iter->Seek(ik.Encode()); + ASSERT_TRUE(iter->Valid()); + + ParsedInternalKey pik; + ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 10); + ASSERT_EQ(pik.user_key.ToString(), k); + ASSERT_EQ(iter->value().ToString(), k); + } + delete iter; + + // Update global sequence number to 3 + SetGlobalSeqno(3); + GetVersionAndGlobalSeqno(); + ASSERT_EQ(2, version); + ASSERT_EQ(3, global_seqno); + + iter = GetTableInternalIter(); + current_c = 'a'; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ParsedInternalKey pik; + ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 3); + ASSERT_EQ(pik.user_key, iter->value()); + ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c)); + current_c++; + } + ASSERT_EQ(current_c, 'z' + 1); + + // Verify Seek + for (char c = 'a'; c <= 'z'; c++) { + std::string k = std::string(8, c); + // seqno=4 is less than 3 so we still should get our key + InternalKey ik(k, 4, kValueTypeForSeek); + iter->Seek(ik.Encode()); + ASSERT_TRUE(iter->Valid()); + + ParsedInternalKey pik; + ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + + ASSERT_EQ(pik.type, ValueType::kTypeValue); + ASSERT_EQ(pik.sequence, 3); + ASSERT_EQ(pik.user_key.ToString(), k); + ASSERT_EQ(iter->value().ToString(), k); + } + + delete iter; +} + +TEST_P(BlockBasedTableTest, BlockAlignTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<WritableFileWriter> file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); + Options options; + options.compression = kNoCompression; + 
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + std::unique_ptr<RandomAccessFileReader> file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + // Helper function to get version, global_seqno, global_seqno_offset + std::function<void()> VerifyBlockAlignment = [&]() { + TableProperties* props = nullptr; + ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props, true /* compression_type_missing */)); + + uint64_t data_block_size = props->data_size / props->num_data_blocks; + ASSERT_EQ(data_block_size, 4096); + ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); + delete props; + }; + + VerifyBlockAlignment(); + + // The below block of code verifies that we can read back the keys. 
Set + // block_align to false when creating the reader to ensure we can flip between + // the two modes without any issues + std::unique_ptr<TableReader> table_reader; + bbto.block_align = false; + Options options2; + options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ImmutableCFOptions ioptions2(options2); + const MutableCFOptions moptions2(options2); + + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), + EnvOptions(), + GetPlainInternalComparator(options2.comparator)), + std::move(file_reader), ss_rw.contents().size(), &table_reader)); + + std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator( + ReadOptions(), moptions2.prefix_extractor.get())); + + int expected_key = 1; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << expected_key++; + std::string key = ostr.str(); + std::string value = "val"; + + ASSERT_OK(db_iter->status()); + ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key); + ASSERT_EQ(db_iter->value().ToString(), value); + } + expected_key--; + ASSERT_EQ(expected_key, 10000); + table_reader.reset(); +} + +TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr<WritableFileWriter> file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); + + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + + std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + std::unique_ptr<RandomAccessFileReader> file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + { + RandomAccessFileReader* file = file_reader.get(); + uint64_t file_size = ss_rw.contents().size(); + + Footer footer; + ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, kBlockBasedTableMagicNumber)); + + auto BlockFetchHelper = [&](const BlockHandle& handle, + BlockContents* contents) { + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, handle, + contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options); + + ASSERT_OK(block_fetcher.ReadBlockContents()); + }; + + // -- 
Read metaindex block + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + + BlockFetchHelper(metaindex_handle, &metaindex_contents); + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + std::unique_ptr<InternalIterator> meta_iter( + metaindex_block.NewIterator<DataBlockIter>(BytewiseComparator(), + BytewiseComparator())); + bool found_properties_block = true; + ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); + ASSERT_TRUE(found_properties_block); + + // -- Read properties block + Slice v = meta_iter->value(); + BlockHandle properties_handle; + ASSERT_OK(properties_handle.DecodeFrom(&v)); + BlockContents properties_contents; + + BlockFetchHelper(properties_handle, &properties_contents); + Block properties_block(std::move(properties_contents), + kDisableGlobalSequenceNumber); + + ASSERT_EQ(properties_block.NumRestarts(), 1); + } +} + +TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { + // The properties meta-block should come at the end since we always need to + // read it when opening a file, unlike index/filter/other meta-blocks, which + // are sometimes read depending on the user's configuration. This ordering + // allows us to do a small readahead on the end of the file to read properties + // and meta-index blocks with one I/O. + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr<RandomAccessFileReader> table_reader{ + test::GetRandomAccessFileReader( + new test::StringSource(table_sink->contents(), 0 /* unique_id */, + false /* allow_mmap_reads */))}; + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + ASSERT_OK(ReadFooterFromFile(table_reader.get(), + nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + pcache_opts, nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + // verify properties block comes last + std::unique_ptr<InternalIterator> metaindex_iter{ + metaindex_block.NewIterator<DataBlockIter>(options.comparator, + options.comparator)}; + uint64_t max_offset = 0; + std::string key_at_max_offset; + for 
(metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + BlockHandle handle; + Slice value = metaindex_iter->value(); + ASSERT_OK(handle.DecodeFrom(&value)); + if (handle.offset() > max_offset) { + max_offset = handle.offset(); + key_at_max_offset = metaindex_iter->key().ToString(); + } + } + ASSERT_EQ(kPropertiesBlock, key_at_max_offset); + // index handle is stored in footer rather than metaindex block, so need + // separate logic to verify it comes before properties block. + ASSERT_GT(max_offset, footer.index_handle().offset()); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, BadOptions) { + rocksdb::Options options; + options.compression = kNoCompression; + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_size = 4000; + bbto.block_align = true; + + const std::string kDBPath = + test::PerThreadDBPath("block_based_table_bad_options_test"); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); + + bbto.block_size = 4096; + options.compression = kSnappyCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); +} + +TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { + TailPrefetchStats tpstats; + ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize()); + tpstats.RecordEffectiveSize(size_t{1000}); + tpstats.RecordEffectiveSize(size_t{1005}); + tpstats.RecordEffectiveSize(size_t{1002}); + ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize()); + + // One single super large value shouldn't influence much + tpstats.RecordEffectiveSize(size_t{1002000}); + tpstats.RecordEffectiveSize(size_t{999}); + ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize()); + + // Only history of 32 is kept + for (int i = 0; i < 32; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize()); + + // 16 large values and 16 small values. The result should be closer + // to the small value as the algorithm. + for (int i = 0; i < 16; i++) { + tpstats.RecordEffectiveSize(size_t{1000}); + } + tpstats.RecordEffectiveSize(size_t{10}); + tpstats.RecordEffectiveSize(size_t{20}); + for (int i = 0; i < 6; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize()); +} + +TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { + TailPrefetchStats tpstats; + FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); + buffer.TryReadFromCache(500, 10, nullptr); + buffer.TryReadFromCache(480, 10, nullptr); + buffer.TryReadFromCache(490, 10, nullptr); + ASSERT_EQ(480, buffer.min_offset_read()); +} + +TEST_P(BlockBasedTableTest, DataBlockHashIndex) { + const int kNumKeys = 500; + const int kKeySize = 8; + const int kValSize = 40; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + + static Random rnd(1048); + for (int i = 0; i < kNumKeys; i++) { + // padding one "0" to mark existent keys. 
+ std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + InternalKey k(random_key, 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto reader = c.GetTableReader(); + + std::unique_ptr<InternalIterator> seek_iter; + seek_iter.reset( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + // for every kv, we seek using two method: Get() and Seek() + // Get() will use the SuffixIndexHash in Block. For non-existent key it + // will invalidate the iterator + // Seek() will use the default BinarySeek() in Block. So for non-existent + // key it will land at the closest key that is large than target. + + // Search for existent keys + for (auto& kv : kvmap) { + if (i == 0) { + // Search using Seek() + seek_iter->Seek(kv.first); + ASSERT_OK(seek_iter->status()); + ASSERT_TRUE(seek_iter->Valid()); + ASSERT_EQ(seek_iter->key(), kv.first); + ASSERT_EQ(seek_iter->value(), kv.second); + } else { + // Search using Get() + PinnableSlice value; + std::string user_key = ExtractUserKey(kv.first).ToString(); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, kv.first, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, Slice(kv.second)); + value.Reset(); + } + } + + // Search for non-existent keys + for (auto& kv : kvmap) { + std::string user_key = ExtractUserKey(kv.first).ToString(); + user_key.back() = '0'; // make it non-existent key + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + if (i == 0) { // Search using Seek() + seek_iter->Seek(encoded_key); + ASSERT_OK(seek_iter->status()); + if (seek_iter->Valid()) { + ASSERT_TRUE(BytewiseComparator()->Compare( + user_key, ExtractUserKey(seek_iter->key())) < 0); + } + } else { // Search using Get() + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc new file mode 100644 index 00000000..a8f617de --- /dev/null +++ b/src/rocksdb/table/two_level_iterator.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#include "table/two_level_iterator.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+namespace {
+
+class TwoLevelIndexIterator : public InternalIteratorBase<BlockHandle> {
+ public:
+  explicit TwoLevelIndexIterator(
+      TwoLevelIteratorState* state,
+      InternalIteratorBase<BlockHandle>* first_level_iter);
+
+  ~TwoLevelIndexIterator() override {
+    first_level_iter_.DeleteIter(false /* is_arena_mode */);
+    second_level_iter_.DeleteIter(false /* is_arena_mode */);
+    delete state_;
+  }
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() override;
+  void Prev() override;
+
+  bool Valid() const override { return second_level_iter_.Valid(); }
+  Slice key() const override {
+    assert(Valid());
+    return second_level_iter_.key();
+  }
+  BlockHandle value() const override {
+    assert(Valid());
+    return second_level_iter_.value();
+  }
+  Status status() const override {
+    if (!first_level_iter_.status().ok()) {
+      assert(second_level_iter_.iter() == nullptr);
+      return first_level_iter_.status();
+    } else if (second_level_iter_.iter() != nullptr &&
+               !second_level_iter_.status().ok()) {
+      return second_level_iter_.status();
+    } else {
+      return status_;
+    }
+  }
+  void SetPinnedItersMgr(
+      PinnedIteratorsManager* /*pinned_iters_mgr*/) override {}
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return false; }
+
+ private:
+  void SaveError(const Status& s) {
+    if (status_.ok() && !s.ok()) status_ = s;
+  }
+  void SkipEmptyDataBlocksForward();
+  void SkipEmptyDataBlocksBackward();
+  void SetSecondLevelIterator(InternalIteratorBase<BlockHandle>* iter);
+  void InitDataBlock();
+
+  TwoLevelIteratorState* state_;
+  IteratorWrapperBase<BlockHandle> first_level_iter_;
+  IteratorWrapperBase<BlockHandle> second_level_iter_;  // May be nullptr
+  Status status_;
+  // If second_level_iter is non-nullptr, then "data_block_handle_" holds the
+  // handle passed to state_->NewSecondaryIterator() to create it.
+  BlockHandle data_block_handle_;
+};
+
+TwoLevelIndexIterator::TwoLevelIndexIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<BlockHandle>* first_level_iter)
+    : state_(state), first_level_iter_(first_level_iter) {}
+
+void TwoLevelIndexIterator::Seek(const Slice& target) {
+  first_level_iter_.Seek(target);
+
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.Seek(target);
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekForPrev(const Slice& target) {
+  first_level_iter_.Seek(target);
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekForPrev(target);
+  }
+  if (!Valid()) {
+    if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) {
+      first_level_iter_.SeekToLast();
+      InitDataBlock();
+      if (second_level_iter_.iter() != nullptr) {
+        second_level_iter_.SeekForPrev(target);
+      }
+    }
+    SkipEmptyDataBlocksBackward();
+  }
+}
+
+void TwoLevelIndexIterator::SeekToFirst() {
+  first_level_iter_.SeekToFirst();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToFirst();
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekToLast() {
+  first_level_iter_.SeekToLast();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToLast();
+  }
+  SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::Next() {
+  assert(Valid());
+  second_level_iter_.Next();
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::Prev() {
+  assert(Valid());
+  second_level_iter_.Prev();
+  SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+    // Move to next block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Next();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToFirst();
+    }
+  }
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+    // Move to previous block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Prev();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToLast();
+    }
+  }
+}
+
+void TwoLevelIndexIterator::SetSecondLevelIterator(
+    InternalIteratorBase<BlockHandle>* iter) {
+  InternalIteratorBase<BlockHandle>* old_iter = second_level_iter_.Set(iter);
+  delete old_iter;
+}
+
+void TwoLevelIndexIterator::InitDataBlock() {
+  if (!first_level_iter_.Valid()) {
+    SetSecondLevelIterator(nullptr);
+  } else {
+    BlockHandle handle = first_level_iter_.value();
+    if (second_level_iter_.iter() != nullptr &&
+        !second_level_iter_.status().IsIncomplete() &&
+        handle.offset() == data_block_handle_.offset()) {
+      // second_level_iter is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      InternalIteratorBase<BlockHandle>* iter =
+          state_->NewSecondaryIterator(handle);
+      data_block_handle_ = handle;
+      SetSecondLevelIterator(iter);
+    }
+  }
+}
+
+}  // namespace
+
+InternalIteratorBase<BlockHandle>* NewTwoLevelIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<BlockHandle>* first_level_iter) {
+  return new TwoLevelIndexIterator(state, first_level_iter);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h
new file mode 100644
index 00000000..55d5c01a
--- /dev/null
+++ b/src/rocksdb/table/two_level_iterator.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/iterator.h"
+#include "rocksdb/env.h"
+#include "table/iterator_wrapper.h"
+
+namespace rocksdb {
+
+struct ReadOptions;
+class InternalKeyComparator;
+
+// TwoLevelIteratorState expects that iterators are not created using the arena
+struct TwoLevelIteratorState {
+  TwoLevelIteratorState() {}
+
+  virtual ~TwoLevelIteratorState() {}
+  virtual InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
+      const BlockHandle& handle) = 0;
+};
+
+
+// Return a new two level iterator. A two-level iterator contains a
+// first-level (index) iterator whose values point to a sequence of blocks,
+// where each block is itself a sequence of key/value pairs. The returned
+// two-level iterator yields the concatenation of all key/value pairs in the
+// sequence of blocks. Takes ownership of "state" and "first_level_iter" and
+// will delete them when no longer needed.
+//
+// Uses state->NewSecondaryIterator() to convert a first_level_iter value
+// into an iterator over the contents of the corresponding block.
+// Note: this function expects that first_level_iter was not created using the arena
+extern InternalIteratorBase<BlockHandle>* NewTwoLevelIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<BlockHandle>* first_level_iter);
+
+}  // namespace rocksdb
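The concatenation behaviour above is driven entirely by the TwoLevelIteratorState subclass a caller supplies: the two-level iterator only knows how to walk the first-level iterator and to ask the state for a second-level iterator per BlockHandle. The sketch below is illustrative only and is not part of this patch; LambdaIteratorState and MakeTwoLevelIndexIterator are made-up names for this example (inside RocksDB, the block-based table reader supplies its own TwoLevelIteratorState implementation, e.g. for partitioned indexes), but the TwoLevelIteratorState and NewTwoLevelIterator signatures it relies on are exactly the ones declared in the header above.

#include <functional>
#include <utility>

#include "table/two_level_iterator.h"

namespace rocksdb {

// Hypothetical adapter (example only): forwards NewSecondaryIterator() to a
// std::function so any handle -> iterator factory can be plugged in.
class LambdaIteratorState : public TwoLevelIteratorState {
 public:
  using Factory =
      std::function<InternalIteratorBase<BlockHandle>*(const BlockHandle&)>;

  explicit LambdaIteratorState(Factory factory)
      : factory_(std::move(factory)) {}

  InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
      const BlockHandle& handle) override {
    // Open the block that `handle` points to and return an iterator over it.
    return factory_(handle);
  }

 private:
  Factory factory_;
};

// Example wrapper: builds the concatenated iterator. NewTwoLevelIterator()
// takes ownership of both the state and the first-level iterator, so the
// caller must not delete them separately.
InternalIteratorBase<BlockHandle>* MakeTwoLevelIndexIterator(
    InternalIteratorBase<BlockHandle>* first_level_iter,
    LambdaIteratorState::Factory factory) {
  return NewTwoLevelIterator(new LambdaIteratorState(std::move(factory)),
                             first_level_iter);
}

}  // namespace rocksdb

Because TwoLevelIndexIterator's destructor deletes state_ and both wrapped iterators, the caller only has to delete the iterator returned by NewTwoLevelIterator; that single-owner contract is what the ownership note in the header documents.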