author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:45:59 +0000
commit | 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree | 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/table/block_based
parent | Initial commit. (diff)
download | ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds. (upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/table/block_based')
43 files changed, 16655 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block.cc b/src/rocksdb/table/block_based/block.cc new file mode 100644 index 000000000..a04dd8ac2 --- /dev/null +++ b/src/rocksdb/table/block_based/block.cc @@ -0,0 +1,1004 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Decodes the blocks generated by block_builder.cc. + +#include "table/block_based/block.h" +#include <algorithm> +#include <string> +#include <unordered_map> +#include <vector> + +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" +#include "table/format.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper routine: decode the next block entry starting at "p", +// storing the number of shared key bytes, non_shared key bytes, +// and the length of the value in "*shared", "*non_shared", and +// "*value_length", respectively. Will not derefence past "limit". +// +// If any errors are detected, returns nullptr. Otherwise, returns a +// pointer to the key delta (just past the three decoded values). +struct DecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + assert(limit - p >= 3); + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + // Using an assert in place of "return null" since we should not pay the + // cost of checking for corruption on every single key decoding + assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length))); + return p; + } +}; + +// Helper routine: similar to DecodeEntry but does not have assertions. +// Instead, returns nullptr so that caller can detect and report failure. +struct CheckAndDecodeEntry { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. 
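The fast path above relies on the entry header layout: three varint32 fields (shared key length, non-shared key length, value length), each of which occupies exactly one byte when its value is below 128. As an illustration, here is a minimal standalone sketch of that header encoding, using a local varint helper rather than RocksDB's util/coding.h (all names below are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Minimal varint32 append, mirroring the LevelDB/RocksDB wire format:
    // 7 payload bits per byte, high bit set on all but the last byte.
    static void AppendVarint32(std::string* out, uint32_t v) {
      while (v >= 128) {
        out->push_back(static_cast<char>(v | 0x80));
        v >>= 7;
      }
      out->push_back(static_cast<char>(v));
    }

    // Encode the three-field entry header described above.
    static std::string EncodeEntryHeader(uint32_t shared, uint32_t non_shared,
                                         uint32_t value_length) {
      std::string h;
      AppendVarint32(&h, shared);
      AppendVarint32(&h, non_shared);
      AppendVarint32(&h, value_length);
      return h;
    }

    int main() {
      // All three fields < 128: the header is exactly 3 bytes (fast path).
      assert(EncodeEntryHeader(3, 5, 7).size() == 3);
      // A large value length needs a multi-byte varint (slow path).
      assert(EncodeEntryHeader(3, 5, 300).size() == 4);
      return 0;
    }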
+ if (limit - p < 3) { + return nullptr; + } + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + *value_length = reinterpret_cast<const unsigned char*>(p)[2]; + if ((*shared | *non_shared | *value_length) < 128) { + // Fast path: all three values are encoded in one byte each + p += 3; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) { + return nullptr; + } + } + + if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { + return nullptr; + } + return p; + } +}; + +struct DecodeKey { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + uint32_t value_length; + return DecodeEntry()(p, limit, shared, non_shared, &value_length); + } +}; + +// In format_version 4, which is used by index blocks, the value size is not +// encoded before the entry, as the value is known to be the handle with the +// known size. +struct DecodeKeyV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared) { + // We need 2 bytes for shared and non_shared size. We also need one more + // byte either for value size or the actual value in case of value delta + // encoding. + if (limit - p < 3) return nullptr; + *shared = reinterpret_cast<const unsigned char*>(p)[0]; + *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; + if ((*shared | *non_shared) < 128) { + // Fast path: all three values are encoded in one byte each + p += 2; + } else { + if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; + if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; + } + return p; + } +}; + +void DataBlockIter::Next() { + assert(Valid()); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::NextOrReport() { + assert(Valid()); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::Next() { + assert(Valid()); + ParseNextIndexKey(); +} + +void IndexBlockIter::Prev() { + assert(Valid()); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + // Loop until end of current entry hits the start of original entry + while (ParseNextIndexKey() && NextEntryOffset() < original) { + } +} + +// Similar to IndexBlockIter::Prev but also caches the prev entries +void DataBlockIter::Prev() { + assert(Valid()); + + assert(prev_entries_idx_ == -1 || + static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* key_ptr = nullptr; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + key_pinned_ = true; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + key_pinned_ 
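Both Prev() implementations above share the same backward step: walk the restart array down to the last restart point strictly before the current entry, then re-parse forward from there. A toy model of that step, with a plain vector standing in for the restart array (a sketch, not RocksDB code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // restart_offsets holds the byte offset of each restart point; `original`
    // is the offset of the entry we are stepping back from. Returns the index
    // of the last restart point strictly before `original`, or UINT32_MAX if
    // none exists (the iterator becomes invalid in that case).
    static uint32_t RestartBefore(const std::vector<uint32_t>& restart_offsets,
                                  uint32_t original) {
      uint32_t idx = static_cast<uint32_t>(restart_offsets.size()) - 1;
      while (restart_offsets[idx] >= original) {
        if (idx == 0) return UINT32_MAX;  // no earlier entry
        --idx;
      }
      return idx;
    }

    int main() {
      std::vector<uint32_t> restarts = {0, 64, 128};
      assert(RestartBefore(restarts, 70) == 1);          // inside 2nd interval
      assert(RestartBefore(restarts, 64) == 0);          // exactly on a restart
      assert(RestartBefore(restarts, 0) == UINT32_MAX);  // before first entry
      return 0;
    }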
= false; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + key_.SetKey(current_key, false /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + + SeekToRestartPoint(restart_index_); + + do { + if (!ParseNextDataKey<DecodeEntry>()) { + break; + } + Slice current_key = key(); + + if (key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } + // Loop until end of current entry hits the start of original entry + } while (NextEntryOffset() < original); + prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1; +} + +void DataBlockIter::Seek(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= target + while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { + } +} + +// Optimized Seek for point lookup for an internal key `target` +// target = "seek_user_key @ type | seqno". +// +// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex, this function behaves identically as Seek(). +// +// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, +// or kTypeBlobIndex: +// +// If the return value is FALSE, iter location is undefined, and it means: +// 1) there is no key in this block falling into the range: +// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"], +// inclusive; AND +// 2) the last key of this block has a greater user_key from seek_user_key +// +// If the return value is TRUE, iter location has two possibilies: +// 1) If iter is valid, it is set to a location as if set by BinarySeek. In +// this case, it points to the first key_ with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. +// 2) If the iter is invalid, it means that either all the user_key is less +// than the seek_user_key, or the block ends with a matching user_key but +// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno +// but larger type). 
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+  Slice target_user_key = ExtractUserKey(target);
+  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+  uint8_t entry =
+      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
+
+  if (entry == kCollision) {
+    // HashSeek not effective, falling back
+    Seek(target);
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    // Even if we cannot find the user_key in this block, the result may
+    // exist in the next block. Consider this example:
+    //
+    // Block N:     [aab@100, ... , app@120]
+    // boundary key: axy@50 (we make minimal assumptions about a boundary key)
+    // Block N+1:   [axy@10, ... ]
+    //
+    // If seek_key = axy@60, the search will start from Block N.
+    // Even if the user_key is not found in the hash map, the caller still
+    // has to continue searching the next block.
+    //
+    // In this case, we pretend the key is in the last restart interval.
+    // The while-loop below will search the last restart interval for the
+    // key. It will stop at the first key that is larger than the seek_key,
+    // or at the end of the block if no key is larger.
+    entry = static_cast<uint8_t>(num_restarts_ - 1);
+  }
+
+  uint32_t restart_index = entry;
+
+  // check if the key is in the restart_interval
+  assert(restart_index < num_restarts_);
+  SeekToRestartPoint(restart_index);
+
+  const char* limit = nullptr;
+  if (restart_index_ + 1 < num_restarts_) {
+    limit = data_ + GetRestartPoint(restart_index_ + 1);
+  } else {
+    limit = data_ + restarts_;
+  }
+
+  while (true) {
+    // Here we only linearly seek the target key inside the restart interval.
+    // If a key does not exist inside a restart interval, we avoid
+    // further searching the block content across restart interval boundaries.
+    //
+    // TODO(fwu): check the left and right boundary of the restart interval
+    // to avoid linearly seeking a target key that is out of range.
+    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
+      // we stop at the first potential matching user key.
+      break;
+    }
+  }
+
+  if (current_ == restarts_) {
+    // The search reached the end of the block. There are three possibilities:
+    // 1) there is only one user_key match in the block (otherwise collision).
+    //    the matching user_key resides in the last restart interval, and it
+    //    is the last key of the restart interval and of the block as well.
+    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
+    //
+    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+    //    AND all existing user_keys in the restart interval are smaller than
+    //    seek_user_key.
+    //
+    // 3) The seek_key is a false positive and happens to be hashed to the
+    //    last restart interval, AND all existing user_keys in the restart
+    //    interval are smaller than seek_user_key.
+    //
+    // The result may exist in the next block in each case, so we return true.
+    return true;
+  }
+
+  if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
+    // the key is not in this block and cannot be in the next block either.
+    return false;
+  }
+
+  // Here we are conservative and only support a limited set of cases
+  ValueType value_type = ExtractValueType(key_.GetKey());
+  if (value_type != ValueType::kTypeValue &&
+      value_type != ValueType::kTypeDeletion &&
+      value_type != ValueType::kTypeSingleDeletion &&
+      value_type != ValueType::kTypeBlobIndex) {
+    Seek(target);
+    return true;
+  }
+
+  // Result found, and the iter is correctly set.
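The "user_key @ type | seqno" notation in the comments above refers to RocksDB's internal-key trailer. A sketch of why a higher seqno sorts first among equal user keys: the trailer packs (seqno << 8) | type into 8 bytes and the internal comparator orders that trailer descending. The packing mirrors db/dbformat.h; the concrete enum value used below is an assumption for illustration:

    #include <cassert>
    #include <cstdint>

    // Pack a 56-bit sequence number and a one-byte value type into the
    // 8-byte internal-key trailer.
    static uint64_t PackSeqnoAndType(uint64_t seqno, uint8_t type) {
      assert(seqno < (1ull << 56));  // seqnos are 56-bit
      return (seqno << 8) | type;
    }

    int main() {
      const uint8_t kTypeValue = 0x1;  // assumed to match dbformat.h
      // axy@60 vs axy@10: the higher seqno yields the larger trailer and
      // therefore sorts earlier under the descending internal ordering.
      assert(PackSeqnoAndType(60, kTypeValue) >
             PackSeqnoAndType(10, kTypeValue));
      return 0;
    }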
+ return true; +} + +void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + uint32_t index = 0; + bool ok = false; + if (prefix_index_) { + bool prefix_may_exist = true; + ok = PrefixSeek(target, &index, &prefix_may_exist); + if (!prefix_may_exist) { + // This is to let the caller to distinguish between non-existing prefix, + // and when key is larger than the last key, which both set Valid() to + // false. + current_ = restarts_; + status_ = Status::NotFound(); + } + } else if (value_delta_encoded_) { + ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } else { + ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + } + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= target + while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) { + } +} + +void DataBlockIter::SeekForPrev(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index, + comparator_); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + + // Linear search (within restart block) for first key >= seek_key + while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) { + } + if (!Valid()) { + SeekToLast(); + } else { + while (Valid() && Compare(key_, seek_key) > 0) { + Prev(); + } + } +} + +void DataBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<DecodeEntry>(); +} + +void DataBlockIter::SeekToFirstOrReport() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(0); + ParseNextDataKey<CheckAndDecodeEntry>(); +} + +void IndexBlockIter::SeekToFirst() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(0); + ParseNextIndexKey(); +} + +void DataBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void IndexBlockIter::SeekToLast() { + if (data_ == nullptr) { // Not init yet + return; + } + status_ = Status::OK(); + SeekToRestartPoint(num_restarts_ - 1); + while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +template <class TValue> +void BlockIter<TValue>::CorruptionError() { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption("bad entry in block"); + key_.Clear(); + value_.clear(); +} + +template <typename DecodeEntryFunc> +bool DataBlockIter::ParseNextDataKey(const char* limit) { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + if (!limit) { + limit = data_ + restarts_; // Restarts come right after data + } + + if (p >= limit) { + // No more entries to return. Mark as invalid. 
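Seek(), in both iterators above, is a two-phase search: binary-search the restart array for the last restart key below the target, then scan forward linearly for the first key >= target. A self-contained model of the pattern on a sorted array, where every interval-th element plays the role of a restart point (the upward-biased midpoint matches the BinarySeek shown further down):

    #include <cassert>
    #include <vector>

    // Returns the index of the first key >= target; keys.size() means
    // "past the end" (the iterator would become invalid).
    static size_t TwoPhaseSeek(const std::vector<int>& keys, size_t interval,
                               int target) {
      // Phase 1: binary search over restart points (indices 0, interval, ...).
      size_t left = 0, right = (keys.size() - 1) / interval;
      while (left < right) {
        size_t mid = (left + right + 1) / 2;  // bias up so left = mid converges
        if (keys[mid * interval] < target) {
          left = mid;       // restarts before mid are uninteresting
        } else {
          right = mid - 1;  // restarts at/after mid are uninteresting
        }
      }
      // Phase 2: linear scan starting at the chosen restart point.
      size_t i = left * interval;
      while (i < keys.size() && keys[i] < target) ++i;
      return i;
    }

    int main() {
      std::vector<int> keys = {10, 20, 30, 40, 50, 60, 70, 80};
      assert(TwoPhaseSeek(keys, 2, 45) == 4);  // first key >= 45 is 50
      assert(TwoPhaseSeek(keys, 2, 10) == 0);
      assert(TwoPhaseSeek(keys, 2, 99) == keys.size());
      return 0;
    }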
+ current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } else { + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. + key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. + assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(key_.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + if (key_pinned_) { + // TODO(tec): Investigate updating the seqno in the loaded block + // directly instead of doing a copy and update. + + // We cannot use the key address in the block directly because + // we have a global_seqno_ that will overwrite the encoded one. + key_.OwnKey(); + key_pinned_ = false; + } + + key_.UpdateInternalKey(global_seqno_, value_type); + } + + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + return true; + } +} + +bool IndexBlockIter::ParseNextIndexKey() { + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; + } + + // Decode next entry + uint32_t shared, non_shared, value_length; + if (value_delta_encoded_) { + p = DecodeKeyV4()(p, limit, &shared, &non_shared); + value_length = 0; + } else { + p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); + } + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. 
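The TrimAppend() call above reconstructs a prefix-compressed key: keep the first `shared` bytes of the previous key, then append the `non_shared` suffix stored in the block. A one-function sketch of the same idea:

    #include <cassert>
    #include <string>

    // Rebuild the current key from the previous key and the suffix bytes
    // stored in the block entry.
    static std::string ReconstructKey(const std::string& prev_key,
                                      size_t shared,
                                      const char* non_shared_data,
                                      size_t non_shared_len) {
      std::string key = prev_key.substr(0, shared);
      key.append(non_shared_data, non_shared_len);
      return key;
    }

    int main() {
      // "apple" followed by an entry with shared=2 and suffix "ricot".
      assert(ReconstructKey("apple", 2, "ricot", 5) == "apricot");
      return 0;
    }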
+ key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + value_ = Slice(p + non_shared, value_length); + if (shared == 0) { + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + } + // else we are in the middle of a restart interval and the restart_index_ + // thus has not changed + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + DecodeCurrentValue(shared); + } + return true; +} + +// The format: +// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// ... +// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) +// where, k is key, v is value, and its encoding is in parenthesis. +// The format of each key is (shared_size, non_shared_size, shared, non_shared) +// The format of each value, i.e., block hanlde, is (offset, size) whenever the +// shared_size is 0, which included the first entry in each restart point. +// Otherwise the format is delta-size = block handle size - size of last block +// handle. +void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. + + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); + } +} + +// Binary search in restart array to find the first restart point that +// is either the last restart point with a key less than target, +// which means the key of next restart point is larger than target, or +// the first restart point with a key = target +template <class TValue> +template <typename DecodeKeyFunc> +bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left, + uint32_t right, uint32_t* index, + const Comparator* comp) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared; + const char* key_ptr = DecodeKeyFunc()( + data_ + region_offset, data_ + restarts_, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + int cmp = comp->Compare(mid_key, target); + if (cmp < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. 
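The delta decoding documented above can be summarized as: within a restart interval only the first handle carries an explicit offset, and each later offset is derived from the sizes that precede it. A simplified sketch under that reading (the real DecodeFrom also accounts for per-block trailer bytes, which this deliberately omits):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Handle {
      uint64_t offset;
      uint64_t size;
    };

    // Expand one restart interval: first_offset is stored explicitly; each
    // subsequent handle starts where the previous one ends.
    static std::vector<Handle> DecodeInterval(
        uint64_t first_offset, const std::vector<uint64_t>& sizes) {
      std::vector<Handle> out;
      uint64_t offset = first_offset;
      for (uint64_t sz : sizes) {
        out.push_back({offset, sz});
        offset += sz;  // next handle follows this one
      }
      return out;
    }

    int main() {
      auto handles = DecodeInterval(100, {10, 20, 30});
      assert(handles[1].offset == 110 && handles[2].offset == 130);
      return 0;
    }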
+ left = mid; + } else if (cmp > 0) { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } else { + left = right = mid; + } + } + + *index = left; + return true; +} + +// Compare target key and the block key of the block of `block_index`. +// Return -1 if error. +int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { + uint32_t region_offset = GetRestartPoint(block_index); + uint32_t shared, non_shared; + const char* key_ptr = + value_delta_encoded_ + ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared) + : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared, + &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return 1; // Return target is smaller + } + Slice block_key(key_ptr, non_shared); + return Compare(block_key, target); +} + +// Binary search in block_ids to find the first block +// with a key >= target +bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, + uint32_t* block_ids, uint32_t left, + uint32_t right, uint32_t* index, + bool* prefix_may_exist) { + assert(left <= right); + assert(index); + assert(prefix_may_exist); + *prefix_may_exist = true; + uint32_t left_bound = left; + + while (left <= right) { + uint32_t mid = (right + left) / 2; + + int cmp = CompareBlockKey(block_ids[mid], target); + if (!status_.ok()) { + return false; + } + if (cmp < 0) { + // Key at "target" is larger than "mid". Therefore all + // blocks before or at "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "target" is <= "mid". Therefore all blocks + // after "mid" are uninteresting. + // If there is only one block left, we found it. + if (left == right) break; + right = mid; + } + } + + if (left == right) { + // In one of the two following cases: + // (1) left is the first one of block_ids + // (2) there is a gap of blocks between block of `left` and `left-1`. + // we can further distinguish the case of key in the block or key not + // existing, by comparing the target key and the key of the previous + // block to the left of the block found. + if (block_ids[left] > 0 && + (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && + CompareBlockKey(block_ids[left] - 1, target) > 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } + + *index = block_ids[left]; + return true; + } else { + assert(left > right); + + // If the next block key is larger than seek key, it is possible that + // no key shares the prefix with `target`, or all keys with the same + // prefix as `target` are smaller than prefix. In the latter case, + // we are mandated to set the position the same as the total order. + // In the latter case, either: + // (1) `target` falls into the range of the next block. In this case, + // we can place the iterator to the next block, or + // (2) `target` is larger than all block keys. In this case we can + // keep the iterator invalidate without setting `prefix_may_exist` + // to false. + // We might sometimes end up with setting the total order position + // while there is no key sharing the prefix as `target`, but it + // still follows the contract. 
+ uint32_t right_index = block_ids[right]; + assert(right_index + 1 <= num_restarts_); + if (right_index + 1 < num_restarts_) { + if (CompareBlockKey(right_index + 1, target) >= 0) { + *index = right_index + 1; + return true; + } else { + // We have to set the flag here because we are not positioning + // the iterator to the total order position. + *prefix_may_exist = false; + } + } + + // Mark iterator invalid + current_ = restarts_; + return false; + } +} + +bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index, + bool* prefix_may_exist) { + assert(index); + assert(prefix_may_exist); + assert(prefix_index_); + *prefix_may_exist = true; + Slice seek_key = target; + if (!key_includes_seq_) { + seek_key = ExtractUserKey(target); + } + uint32_t* block_ids = nullptr; + uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); + + if (num_blocks == 0) { + current_ = restarts_; + *prefix_may_exist = false; + return false; + } else { + assert(block_ids); + return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index, + prefix_may_exist); + } +} + +uint32_t Block::NumRestarts() const { + assert(size_ >= 2 * sizeof(uint32_t)); + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // In BlockBuilder, we have ensured a block with HashIndex is less than + // kMaxBlockSizeSupportedByHashIndex (64KiB). + // + // Therefore, if we encounter a block with a size > 64KiB, the block + // cannot have HashIndex. So the footer will directly interpreted as + // num_restarts. + // + // Such check is for backward compatibility. We can ensure legacy block + // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted + // correctly as no HashIndex even if the MSB of num_restarts is set. + return num_restarts; + } + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return num_restarts; +} + +BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const { + assert(size_ >= 2 * sizeof(uint32_t)); + if (size_ > kMaxBlockSizeSupportedByHashIndex) { + // The check is for the same reason as that in NumRestarts() + return BlockBasedTableOptions::kDataBlockBinarySearch; + } + uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t)); + uint32_t num_restarts = block_footer; + BlockBasedTableOptions::DataBlockIndexType index_type; + UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts); + return index_type; +} + +Block::~Block() { + // This sync point can be re-enabled if RocksDB can control the + // initialization order of any/all static options created by the user. 
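NumRestarts() and IndexType() above both decode the same 4-byte footer. A sketch of the packing the comments imply, assuming the most significant bit carries the index type and the low 31 bits carry num_restarts (a stand-in for UnPackIndexTypeAndNumRestarts, not copied from it):

    #include <cassert>
    #include <cstdint>

    // MSB set => kDataBlockBinaryAndHash; low 31 bits => num_restarts.
    static uint32_t PackFooter(bool binary_and_hash, uint32_t num_restarts) {
      assert(num_restarts < 0x80000000u);
      return (binary_and_hash ? 0x80000000u : 0u) | num_restarts;
    }

    static void UnpackFooter(uint32_t footer, bool* binary_and_hash,
                             uint32_t* num_restarts) {
      *binary_and_hash = (footer & 0x80000000u) != 0;
      *num_restarts = footer & 0x7fffffffu;
    }

    int main() {
      bool hash = false;
      uint32_t n = 0;
      UnpackFooter(PackFooter(true, 42), &hash, &n);
      assert(hash && n == 42);
      return 0;
    }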
+ // TEST_SYNC_POINT("Block::~Block"); +} + +Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()), + restart_offset_(0), + num_restarts_(0), + global_seqno_(_global_seqno) { + TEST_SYNC_POINT("Block::Block:0"); + if (size_ < sizeof(uint32_t)) { + size_ = 0; // Error marker + } else { + // Should only decode restart points for uncompressed blocks + num_restarts_ = NumRestarts(); + switch (IndexType()) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + restart_offset_ = static_cast<uint32_t>(size_) - + (1 + num_restarts_) * sizeof(uint32_t); + if (restart_offset_ > size_ - sizeof(uint32_t)) { + // The size is too small for NumRestarts() and therefore + // restart_offset_ wrapped around. + size_ = 0; + } + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + if (size_ < sizeof(uint32_t) /* block footer */ + + sizeof(uint16_t) /* NUM_BUCK */) { + size_ = 0; + break; + } + + uint16_t map_offset; + data_block_hash_index_.Initialize( + contents.data.data(), + static_cast<uint16_t>(contents.data.size() - + sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ + &map_offset); + + restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); + + if (restart_offset_ > map_offset) { + // map_offset is too small for NumRestarts() and + // therefore restart_offset_ wrapped around. + size_ = 0; + break; + } + break; + default: + size_ = 0; // Error marker + } + } + if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { + read_amp_bitmap_.reset(new BlockReadAmpBitmap( + restart_offset_, read_amp_bytes_per_bit, statistics)); + } +} + +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { + DataBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new DataBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + ret_iter->Initialize( + cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + read_amp_bitmap_.get(), block_contents_pinned, + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + if (read_amp_bitmap_) { + if (read_amp_bitmap_->GetStatistics() != stats) { + // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ + read_amp_bitmap_->SetStatistics(stats); + } + } + } + + return ret_iter; +} + +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { + IndexBlockIter* ret_iter; + if (iter != nullptr) { + ret_iter = iter; + } else { + ret_iter = new IndexBlockIter; + } + if (size_ < 2 * sizeof(uint32_t)) { + ret_iter->Invalidate(Status::Corruption("bad block contents")); + return ret_iter; + } + if (num_restarts_ == 0) { + // Empty block. + ret_iter->Invalidate(Status::OK()); + return ret_iter; + } else { + BlockPrefixIndex* prefix_index_ptr = + total_order_seek ? 
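The kDataBlockBinarySearch branch of the constructor above computes the restart-array offset and detects corruption via unsigned wraparound. A compact model of that layout and check, assuming the block is laid out as [ entries | restart[0..n-1] (4 bytes each) | footer (4 bytes) ]:

    #include <cassert>
    #include <cstdint>

    // Returns false when num_restarts is too large for the block, i.e. the
    // subtraction wrapped around -- the same condition the constructor's
    // "restart_offset_ > size_ - sizeof(uint32_t)" test catches.
    static bool ComputeRestartOffset(size_t block_size, uint32_t num_restarts,
                                     uint32_t* restart_offset) {
      uint32_t off = static_cast<uint32_t>(block_size) -
                     (1 + num_restarts) * static_cast<uint32_t>(sizeof(uint32_t));
      if (off > block_size - sizeof(uint32_t)) {
        return false;  // wrapped around: block too small for num_restarts
      }
      *restart_offset = off;
      return true;
    }

    int main() {
      uint32_t off = 0;
      assert(ComputeRestartOffset(100, 3, &off) && off == 84);
      assert(!ComputeRestartOffset(8, 1000, &off));  // corrupt footer
      return 0;
    }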
nullptr : prefix_index; + ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); + } + + return ret_iter; +} + +size_t Block::ApproximateMemoryUsage() const { + size_t usage = usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + if (read_amp_bitmap_) { + usage += read_amp_bitmap_->ApproximateMemoryUsage(); + } + return usage; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h new file mode 100644 index 000000000..e82a1b2a6 --- /dev/null +++ b/src/rocksdb/table/block_based/block.h @@ -0,0 +1,631 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "port/malloc.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +struct BlockContents; +class Comparator; +template <class TValue> +class BlockIter; +class DataBlockIter; +class IndexBlockIter; +class BlockPrefixIndex; + +// BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data +// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of +// bytes in the Block we update the bitmap and increment +// READ_AMP_ESTIMATE_USEFUL_BYTES. 
+class BlockReadAmpBitmap { + public: + explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit, + Statistics* statistics) + : bitmap_(nullptr), + bytes_per_bit_pow_(0), + statistics_(statistics), + rnd_(Random::GetTLSInstance()->Uniform( + static_cast<int>(bytes_per_bit))) { + TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_); + assert(block_size > 0 && bytes_per_bit > 0); + + // convert bytes_per_bit to be a power of 2 + while (bytes_per_bit >>= 1) { + bytes_per_bit_pow_++; + } + + // num_bits_needed = ceil(block_size / bytes_per_bit) + size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1; + assert(num_bits_needed > 0); + + // bitmap_size = ceil(num_bits_needed / kBitsPerEntry) + size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; + + // Create bitmap and set all the bits to 0 + bitmap_ = new std::atomic<uint32_t>[bitmap_size](); + + RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); + } + + ~BlockReadAmpBitmap() { delete[] bitmap_; } + + void Mark(uint32_t start_offset, uint32_t end_offset) { + assert(end_offset >= start_offset); + // Index of first bit in mask + uint32_t start_bit = + (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >> + bytes_per_bit_pow_; + // Index of last bit in mask + 1 + uint32_t exclusive_end_bit = + (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_; + if (start_bit >= exclusive_end_bit) { + return; + } + assert(exclusive_end_bit > 0); + + if (GetAndSet(start_bit) == 0) { + uint32_t new_useful_bytes = (exclusive_end_bit - start_bit) + << bytes_per_bit_pow_; + RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES, + new_useful_bytes); + } + } + + Statistics* GetStatistics() { + return statistics_.load(std::memory_order_relaxed); + } + + void SetStatistics(Statistics* stats) { statistics_.store(stats); } + + uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; } + + size_t ApproximateMemoryUsage() const { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size((void*)this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return sizeof(*this); + } + + private: + // Get the current value of bit at `bit_idx` and set it to 1 + inline bool GetAndSet(uint32_t bit_idx) { + const uint32_t byte_idx = bit_idx / kBitsPerEntry; + const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry); + + return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) & + bit_mask; + } + + const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes + const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits + + // Bitmap used to record the bytes that we read, use atomic to protect + // against multiple threads updating the same bit + std::atomic<uint32_t>* bitmap_; + // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize + // muliplication and division + uint8_t bytes_per_bit_pow_; + // Pointer to DB Statistics object, Since this bitmap may outlive the DB + // this pointer maybe invalid, but the DB will update it to a valid pointer + // by using SetStatistics() before calling Mark() + std::atomic<Statistics*> statistics_; + uint32_t rnd_; +}; + +// This Block class is not for any old block: it is designed to hold only +// uncompressed blocks containing sorted key-value pairs. It is thus +// suitable for storing uncompressed data blocks, index blocks (including +// partitions), range deletion blocks, properties blocks, metaindex blocks, +// as well as the top level of the partitioned filter structure (which is +// actually an index of the filter partitions). 
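The constructor arithmetic in BlockReadAmpBitmap above reduces to three steps: round bytes_per_bit down to a power of two (kept as an exponent), size the bitmap at one bit per chunk, and pack 32 bits into each atomic entry. A sketch of just the sizing math:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // How many atomic<uint32_t> entries the bitmap needs for a block.
    static size_t BitmapEntries(size_t block_size, size_t bytes_per_bit) {
      uint8_t pow = 0;
      while (bytes_per_bit >>= 1) pow++;                // floor(log2)
      size_t num_bits = ((block_size - 1) >> pow) + 1;  // ceil(size / 2^pow)
      return (num_bits - 1) / 32 + 1;                   // ceil(num_bits / 32)
    }

    int main() {
      // 4 KiB block, 32 bytes per bit -> 128 bits -> 4 uint32 entries.
      assert(BitmapEntries(4096, 32) == 4);
      // A 1-byte block still needs one bit and one entry.
      assert(BitmapEntries(1, 32) == 1);
      return 0;
    }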
It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries (since the latter do not contain sorted key-value pairs). +// Use BlockContents directly for those. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. +class Block { + public: + // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, + size_t read_amp_bytes_per_bit = 0, + Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; + + ~Block(); + + size_t size() const { return size_; } + const char* data() const { return data_; } + // The additional memory space taken by the block data. + size_t usable_size() const { return contents_.usable_size(); } + uint32_t NumRestarts() const; + bool own_bytes() const { return contents_.own_bytes(); } + + BlockBasedTableOptions::DataBlockIndexType IndexType() const; + + // If comparator is InternalKeyComparator, user_comparator is its user + // comparator; they are equal otherwise. + // + // If iter is null, return new Iterator + // If iter is not null, update this one and return it as Iterator* + // + // Updates read_amp_bitmap_ if it is not nullptr. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just pass the target key. + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); + + // Report an approximation of how much memory has been used. 
+ size_t ApproximateMemoryUsage() const; + + SequenceNumber global_seqno() const { return global_seqno_; } + + private: + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() + uint32_t restart_offset_; // Offset in data_ of restart array + uint32_t num_restarts_; + std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_; + // All keys in the block will have seqno = global_seqno_, regardless of + // the encoded value (kDisableGlobalSequenceNumber means disabled) + const SequenceNumber global_seqno_; + + DataBlockHashIndex data_block_hash_index_; +}; + +template <class TValue> +class BlockIter : public InternalIteratorBase<TValue> { + public: + void InitializeBase(const Comparator* comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, bool block_contents_pinned) { + assert(data_ == nullptr); // Ensure it is called only once + assert(num_restarts > 0); // Ensure the param is valid + + comparator_ = comparator; + data_ = data; + restarts_ = restarts; + num_restarts_ = num_restarts; + current_ = restarts_; + restart_index_ = num_restarts_; + global_seqno_ = global_seqno; + block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; + } + + // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do + // nothing. Calls cleanup functions. + void InvalidateBase(Status s) { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + + data_ = nullptr; + current_ = restarts_; + status_ = s; + + // Call cleanup callbacks. + Cleanable::Reset(); + } + + bool Valid() const override { return current_ < restarts_; } + Status status() const override { return status_; } + Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + +#ifndef NDEBUG + ~BlockIter() override { + // Assert that the BlockIter is never deleted while Pinning is Enabled. + assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; +#endif + + bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } + + bool IsValuePinned() const override { return block_contents_pinned_; } + + size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } + + uint32_t ValueOffset() const { + return static_cast<uint32_t>(value_.data() - data_); + } + + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + + protected: + // Note: The type could be changed to InternalKeyComparator but we see a weird + // performance drop by that. + const Comparator* comparator_; + const char* data_; // underlying block contents + uint32_t num_restarts_; // Number of uint32_t entries in restart array + + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) + // current_ is offset in data_ of current entry. >= restarts_ if !Valid + uint32_t current_; + IterKey key_; + Slice value_; + Status status_; + bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. 
PinnableSlice, the pointer to the bytes will still be valid. + bool block_contents_pinned_; + SequenceNumber global_seqno_; + + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + + public: + // Return the offset in data_ just past the end of the current entry. + inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast<uint32_t>((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + void CorruptionError(); + + template <typename DecodeKeyFunc> + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); +}; + +class DataBlockIter final : public BlockIter<Slice> { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, + const char* data, uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) + : DataBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); + } + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { + InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + block_contents_pinned); + user_comparator_ = user_comparator; + key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + void Seek(const Slice& target) override; + + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + + void SeekForPrev(const Slice& target) override; + + void Prev() override; + + void Next() final override; + + // Try to advance to the next entry in the block. If there is data corruption + // or error, report it to the caller instead of aborting the process. May + // incur higher CPU overhead because we need to perform check on every entry. + void NextOrReport(); + + void SeekToFirst() override; + + // Try to seek to the first entry in the block. 
If there is data corruption + // or error, report it to caller instead of aborting the process. May incur + // higher CPU overhead because we need to perform check on every entry. + void SeekToFirstOrReport(); + + void SeekToLast() override; + + void Invalidate(Status s) { + InvalidateBase(s); + // Clear prev entries cache. + prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + private: + // read-amp bitmap + BlockReadAmpBitmap* read_amp_bitmap_; + // last `current_` value we report to read-amp bitmp + mutable uint32_t last_bitmap_offset_; + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector<CachedPrevEntry> prev_entries_; + int32_t prev_entries_idx_ = -1; + + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + template <typename DecodeEntryFunc> + inline bool ParseNextDataKey(const char* limit = nullptr); + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetInternalKey(), b); + } + + bool SeekForGetImpl(const Slice& target); +}; + +class IndexBlockIter final : public BlockIter<IndexValue> { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { + InitializeBase(key_includes_seq ? comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); + key_includes_seq_ = key_includes_seq; + key_.SetIsUserKey(!key_includes_seq_); + prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } + } + + Slice user_key() const override { + if (key_includes_seq_) { + return ExtractUserKey(key()); + } + return key(); + } + + IndexValue value() const override { + assert(Valid()); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + return decoded_value_; + } else { + IndexValue entry; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); + assert(decode_s.ok()); + return entry; + } + } + + // IndexBlockIter follows a different contract for prefix iterator + // from data iterators. 
+ // If prefix of the seek key `target` exists in the file, it must + // return the same result as total order seek. + // If the prefix of `target` doesn't exist in the file, it can either + // return the result of total order seek, or set both of Valid() = false + // and status() = NotFound(). + void Seek(const Slice& target) override; + + void SeekForPrev(const Slice&) override { + assert(false); + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::InvalidArgument( + "RocksDB internal error: should never call SeekForPrev() on index " + "blocks"); + key_.Clear(); + value_.clear(); + } + + void Prev() override; + + void Next() override; + + void SeekToFirst() override; + + void SeekToLast() override; + + void Invalidate(Status s) { InvalidateBase(s); } + + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the restart of encoded size part of the BlockHandle. The + // offset of delta encoded BlockHandles is computed by adding the size of + // previous delta encoded values in the same restart interval to the offset of + // the first value in that restart interval. + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. + IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr<GlobalSeqnoState> global_seqno_state_; + + // Set *prefix_may_exist to false if no key possibly share the same prefix + // as `target`. If not set, the result position should be the same as total + // order Seek. + bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist); + // Set *prefix_may_exist to false if no key can possibly share the same + // prefix as `target`. If not set, the result position should be the same + // as total order seek. 
+ bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index, + bool* prefix_may_exist); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetKey(), b); + } + + inline bool ParseNextIndexKey(); + + // When value_delta_encoded_ is enabled it decodes the value which is assumed + // to be BlockHandle and put it to decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block.cc b/src/rocksdb/table/block_based/block_based_filter_block.cc new file mode 100644 index 000000000..de3f5cb13 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/block_based_filter_block.h" +#include <algorithm> + +#include "db/dbformat.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void AppendItem(std::string* props, const std::string& key, + const std::string& value) { + char cspace = ' '; + std::string value_str(""); + size_t i = 0; + const size_t dataLength = 64; + const size_t tabLength = 2; + const size_t offLength = 16; + + value_str.append(&value[i], std::min(size_t(dataLength), value.size())); + i += dataLength; + while (i < value.size()) { + value_str.append("\n"); + value_str.append(offLength, cspace); + value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i)); + i += dataLength; + } + + std::string result(""); + if (key.size() < (offLength - tabLength)) + result.append(size_t((offLength - tabLength)) - key.size(), cspace); + result.append(key); + + props->append(result + ": " + value_str + "\n"); +} + +template <class TKey> +void AppendItem(std::string* props, const TKey& key, const std::string& value) { + std::string key_str = ROCKSDB_NAMESPACE::ToString(key); + AppendItem(props, key_str, value); +} +} // namespace + +// See doc/table_format.txt for an explanation of the filter block format. 
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + prev_prefix_start_(0), + prev_prefix_size_(0), + num_added_(0) { + assert(policy_); +} + +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } + + if (whole_key_filtering_) { + AddKey(key); + } +} + +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + num_added_++; + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); +} + +// Add prefix to filter if needed +inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { + // get slice for most recently added entry + Slice prev; + if (prev_prefix_size_ > 0) { + prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_); + } + + Slice prefix = prefix_extractor_->Transform(key); + // insert prefix only when it's different from the previous prefix. + if (prev.size() == 0 || prefix != prev) { + prev_prefix_start_ = entries_.size(); + prev_prefix_size_ = prefix.size(); + AddKey(prefix); + } +} + +Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = static_cast<uint32_t>(result_.size()); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void BlockBasedFilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i + 1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
+ filter_offsets_.push_back(static_cast<uint32_t>(result_.size())); + policy_->CreateFilter(&tmp_entries_[0], static_cast<int>(num_entries), + &result_); + + tmp_entries_.clear(); + entries_.clear(); + start_.clear(); + prev_prefix_start_ = 0; + prev_prefix_size_ = 0; +} + +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const BlockBasedTable* t, CachableEntry<BlockContents>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + assert(table()); + assert(table()->get_rep()); + assert(table()->get_rep()->filter_policy); +} + +std::unique_ptr<FilterBlockReader> BlockBasedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<BlockContents> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new BlockBasedFilterBlockReader(table, std::move(filter_block))); +} + +bool BlockBasedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(block_offset != kNotValid); + if (!whole_key_filtering()) { + return true; + } + return MayMatch(key, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + assert(block_offset != kNotValid); + return MayMatch(prefix, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::ParseFieldsFromBlock( + const BlockContents& contents, const char** data, const char** offset, + size_t* num, size_t* base_lg) { + assert(data); + assert(offset); + assert(num); + assert(base_lg); + + const size_t n = contents.data.size(); + if (n < 5) { // 1 byte for base_lg and 4 for start of offset array + return false; + } + + const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5); + if (last_word > n - 5) { + return false; + } + + *data = contents.data.data(); + *offset = (*data) + last_word; + *num = (n - 5 - last_word) / 4; + *base_lg = contents.data[n - 1]; + + return true; +} + +bool BlockBasedFilterBlockReader::MayMatch( + const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + CachableEntry<BlockContents> filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return true; // Errors are treated as potential matches + } + + const uint64_t index = block_offset >> base_lg; + if (index < num) { + const uint32_t 
start = DecodeFixed32(offset + index * 4); + const uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset - data)) { + const Slice filter = Slice(data + start, limit - start); + + assert(table()); + assert(table()->get_rep()); + const FilterPolicy* const policy = table()->get_rep()->filter_policy; + + const bool may_match = policy->KeyMayMatch(entry, filter); + if (may_match) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } else if (start == limit) { + // Empty filters do not match any entries + return false; + } + } + return true; // Errors are treated as potential matches +} + +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BlockBasedFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; +} + +std::string BlockBasedFilterBlockReader::ToString() const { + CachableEntry<BlockContents> filter_block; + + const Status s = + GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + nullptr /* lookup_context */, &filter_block); + if (!s.ok()) { + return std::string("Unable to retrieve filter block"); + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return std::string("Error parsing filter block"); + } + + std::string result; + result.reserve(1024); + + std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); + AppendItem(&result, s_fb, ROCKSDB_NAMESPACE::ToString(num)); + AppendItem(&result, s_bo, s_hd); + + for (size_t index = 0; index < num; index++) { + uint32_t start = DecodeFixed32(offset + index * 4); + uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + + if (start != limit) { + result.append(" filter block # " + + ROCKSDB_NAMESPACE::ToString(index + 1) + "\n"); + Slice filter = Slice(data + start, limit - start); + AppendItem(&result, start, filter.ToString(true)); + } + } + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block.h b/src/rocksdb/table/block_based/block_based_filter_block.h new file mode 100644 index 000000000..01c98a70b --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block.h @@ -0,0 +1,119 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. 
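As a concrete reading of the granularity mentioned above (a sketch with made-up offsets; kFilterBase = 1 << kFilterBaseLg = 2048 is defined in the .cc file):

    #include <cstdint>

    // One filter covers each 2048-byte range of data-block start offsets.
    uint64_t FilterIndexFor(uint64_t data_block_offset) {
      const uint64_t kFilterBase = 1ull << 11;  // matches kFilterBaseLg = 11
      return data_block_offset / kFilterBase;
    }

    // FilterIndexFor(0) == 0, FilterIndexFor(2000) == 0, and
    // FilterIndexFor(5000) == 2, so data blocks starting at offsets 0 and
    // 2000 are covered by the same filter.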
+ +#pragma once + +#include <stddef.h> +#include <stdint.h> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete; + void operator=(const BlockBasedFilterBlockBuilder&) = delete; + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + size_t prev_prefix_start_; // the position of the last appended prefix + // to "entries_". + size_t prev_prefix_size_; // the length of the last appended prefix to + // "entries_". + std::string entries_; // Flattened entry contents + std::vector<size_t> start_; // Starting index in entries_ of each entry + std::string result_; // Filter data computed so far + std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument + std::vector<uint32_t> filter_offsets_; + size_t num_added_; // Number of keys added +}; + +// A FilterBlockReader is used to parse filter from SST table. 
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader + : public FilterBlockReaderCommon<BlockContents> { + public: + BlockBasedFilterBlockReader(const BlockBasedTable* t, + CachableEntry<BlockContents>&& filter_block); + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&) = delete; + void operator=(const BlockBasedFilterBlockReader&) = delete; + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + + // convert this object to a human readable form + std::string ToString() const override; + + private: + static bool ParseFieldsFromBlock(const BlockContents& contents, + const char** data, const char** offset, + size_t* num, size_t* base_lg); + + bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_filter_block_test.cc b/src/rocksdb/table/block_based/block_based_filter_block_test.cc new file mode 100644 index 000000000..283d6a9a2 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_filter_block_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
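Before the test fixtures, a condensed sketch of the builder protocol the tests exercise, matching the "(StartBlock Add*)* Finish" regexp documented in the header above; `table_options` is a hypothetical, pre-configured BlockBasedTableOptions and error handling is elided:

    BlockBasedTableOptions table_options;
    table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));

    BlockBasedFilterBlockBuilder builder(nullptr /* prefix_extractor */,
                                         table_options);
    builder.StartBlock(0);     // data block at offset 0
    builder.Add("foo");
    builder.Add("bar");
    builder.StartBlock(2048);  // first data block in the next 2KB range
    builder.Add("baz");
    Slice filter_block = builder.Finish();  // filters + offsets + base_lg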
+ +#include "table/block_based/block_based_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class FilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FilterBlockTest() : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + Slice slice(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice(builder.Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, 
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + Slice slice(builder.Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + BlockBasedFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {} +}; + +TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + Slice slice(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + Slice slice(builder->Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", 
/*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + Slice slice(builder->Finish()); + + CachableEntry<BlockContents> block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", 
/*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_based_table_builder.cc b/src/rocksdb/table/block_based/block_based_table_builder.cc new file mode 100644 index 000000000..2003008fe --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.cc @@ -0,0 +1,1217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
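One piece of context worth sketching before the implementation: every block this builder writes is followed by a 5-byte trailer, one compression-type byte plus a fixed32 checksum covering both the block contents and the type byte. BuildCrc32cTrailer below is a local illustration, not a RocksDB API; it derives the kCRC32c trailer the same way WriteRawBlock() does later in this file:

    #include <cstdint>

    #include "rocksdb/slice.h"
    #include "util/coding.h"   // EncodeFixed32
    #include "util/crc32c.h"   // crc32c::Value / Extend / Mask

    namespace ROCKSDB_NAMESPACE {
    namespace {
    void BuildCrc32cTrailer(const Slice& block_contents, char type,
                            char* trailer /* 5 bytes */) {
      trailer[0] = type;  // compression type byte comes first
      uint32_t crc =
          crc32c::Value(block_contents.data(), block_contents.size());
      crc = crc32c::Extend(crc, trailer, 1);  // extend over the type byte
      EncodeFixed32(trailer + 1, crc32c::Mask(crc));  // masked crc
    }
    }  // namespace
    }  // namespace ROCKSDB_NAMESPACE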
+
+#include "table/block_based/block_based_table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "db/dbformat.h"
+#include "index_builder.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_filter_block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+#include "table/table_builder.h"
+
+#include "memory/memory_allocator.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+typedef BlockBasedTableOptions::IndexType IndexType;
+
+// Without an anonymous namespace here, we would trip the
+// -Wmissing-prototypes warning.
+namespace {
+
+// Create a filter block builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(
+    const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+    const FilterBuildingContext& context,
+    const bool use_delta_encoding_for_index_values,
+    PartitionedIndexBuilder* const p_index_builder) {
+  const BlockBasedTableOptions& table_opt = context.table_options;
+  if (table_opt.filter_policy == nullptr) return nullptr;
+
+  FilterBitsBuilder* filter_bits_builder =
+      BloomFilterPolicy::GetBuilderFromContext(context);
+  if (filter_bits_builder == nullptr) {
+    return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                            table_opt);
+  } else {
+    if (table_opt.partition_filters) {
+      assert(p_index_builder != nullptr);
+      // Since it takes a while from the filter builder's partition-cut
+      // request until the index builder actually cuts the partition, we
+      // take the lower bound as the partition size.
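+      // Worked example (illustrative numbers, not from the code): with
+      // metadata_block_size = 4096 and block_size_deviation = 10, the
+      // ceiling division below evaluates to ((4096 * 90) + 99) / 100 = 3687,
+      // i.e. at least 90% of the configured metadata block size.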
+      assert(table_opt.block_size_deviation <= 100);
+      auto partition_size =
+          static_cast<uint32_t>(((table_opt.metadata_block_size *
+                                  (100 - table_opt.block_size_deviation)) +
+                                 99) /
+                                100);
+      partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+      return new PartitionedFilterBlockBuilder(
+          mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+          filter_bits_builder, table_opt.index_block_restart_interval,
+          use_delta_encoding_for_index_values, p_index_builder, partition_size);
+    } else {
+      return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                        table_opt.whole_key_filtering,
+                                        filter_bits_builder);
+    }
+  }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+  // Accept the compressed form only if it is at least 12.5% smaller than
+  // the raw input.
+  return compressed_size < raw_size - (raw_size / 8u);
+}
+
+bool CompressBlockInternal(const Slice& raw,
+                           const CompressionInfo& compression_info,
+                           uint32_t format_version,
+                           std::string* compressed_output) {
+  // Will return compressed block contents if (1) the compression method is
+  // supported on this platform and (2) the compression rate is "good enough".
+  switch (compression_info.type()) {
+    case kSnappyCompression:
+      return Snappy_Compress(compression_info, raw.data(), raw.size(),
+                             compressed_output);
+    case kZlibCompression:
+      return Zlib_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kZlibCompression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kBZip2Compression:
+      return BZip2_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kBZip2Compression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kLZ4Compression:
+      return LZ4_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kLZ4Compression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kLZ4HCCompression:
+      return LZ4HC_Compress(
+          compression_info,
+          GetCompressFormatForVersion(kLZ4HCCompression, format_version),
+          raw.data(), raw.size(), compressed_output);
+    case kXpressCompression:
+      return XPRESS_Compress(raw.data(), raw.size(), compressed_output);
+    case kZSTD:
+    case kZSTDNotFinalCompression:
+      return ZSTD_Compress(compression_info, raw.data(), raw.size(),
+                           compressed_output);
+    default:
+      // Unrecognized compression type.
+      return false;
+  }
+}
+
+}  // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow) {
+  *type = info.type();
+
+  if (info.type() == kNoCompression && !info.SampleForCompression()) {
+    return raw;
+  }
+
+  // If requested, we sample one in every N blocks with a fast and a slow
+  // compression algorithm and report the stats. Users can use these stats
+  // to decide whether enabling compression is worthwhile, and to get a hint
+  // about which compression algorithm would be beneficial.
+  if (do_sample && info.SampleForCompression() &&
+      Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) &&
+      sampled_output_fast && sampled_output_slow) {
+    // Sampling with a fast compression algorithm
+    if (LZ4_Supported() || Snappy_Supported()) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (ZSTD_Supported() || Zlib_Supported()) {
+      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+      CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow);
+    }
+  }
+
+  // Actually compress the data
+  if (*type != kNoCompression) {
+    if (CompressBlockInternal(raw, info, format_version, compressed_output) &&
+        GoodCompressionRatio(compressed_output->size(), raw.size())) {
+      return *compressed_output;
+    }
+  }
+
+  // The compression method is not supported, or the compression ratio is
+  // not good enough, so just fall back to the uncompressed form.
+  *type = kNoCompression;
+  return raw;
+}
+
+// kBlockBasedTableMagicNumber was picked by running
+//    echo rocksdb.table.block_based | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by other
+// .cc files; for that reason we declare it extern in the header, but to get
+// the space allocated it must be non-extern in exactly one place.
+const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// We also support reading and writing legacy block based table format (for
+// backwards compatibility)
+const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// A collector that collects properties of interest to block-based table.
+// For now this class looks heavy-weight since we only write one additional
+// property.
+// But in the foreseeable future, we will add more and more properties that
+// are specific to block-based table.
+class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
+    : public IntTblPropCollector {
+ public:
+  explicit BlockBasedTablePropertiesCollector(
+      BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
+      bool prefix_filtering)
+      : index_type_(index_type),
+        whole_key_filtering_(whole_key_filtering),
+        prefix_filtering_(prefix_filtering) {}
+
+  Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // Intentionally left blank. We have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual void BlockAdd(uint64_t /* blockRawBytes */,
+                        uint64_t /* blockCompressedBytesFast */,
+                        uint64_t /* blockCompressedBytesSlow */) override {
+    // Intentionally left blank. No interest in collecting stats for
+    // blocks.
+    return;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string val;
+    PutFixed32(&val, static_cast<uint32_t>(index_type_));
+    properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
+    properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                        whole_key_filtering_ ? kPropTrue : kPropFalse});
+    properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
+                        prefix_filtering_ ? kPropTrue : kPropFalse});
+    return Status::OK();
+  }
+
+  // The name of the properties collector can be used for debugging purposes.
+ const char* Name() const override { + return "BlockBasedTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + // Intentionally left blank. + return UserCollectedProperties(); + } + + private: + BlockBasedTableOptions::IndexType index_type_; + bool whole_key_filtering_; + bool prefix_filtering_; +}; + +struct BlockBasedTableBuilder::Rep { + const ImmutableCFOptions ioptions; + const MutableCFOptions moptions; + const BlockBasedTableOptions table_options; + const InternalKeyComparator& internal_comparator; + WritableFileWriter* file; + uint64_t offset = 0; + Status status; + size_t alignment; + BlockBuilder data_block; + // Buffers uncompressed data blocks and keys to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data + // blocks as it's redundant, but it's easier to implement for now. + std::vector<std::pair<std::string, std::vector<std::string>>> + data_block_and_keys_buffers; + BlockBuilder range_del_block; + + InternalKeySliceTransform internal_prefix_transform; + std::unique_ptr<IndexBuilder> index_builder; + PartitionedIndexBuilder* p_index_builder_ = nullptr; + + std::string last_key; + CompressionType compression_type; + uint64_t sample_for_compression; + CompressionOptions compression_opts; + std::unique_ptr<CompressionDict> compression_dict; + CompressionContext compression_ctx; + std::unique_ptr<UncompressionContext> verify_ctx; + std::unique_ptr<UncompressionDict> verify_dict; + + size_t data_begin_offset = 0; + + TableProperties props; + + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. 
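+  // Transition sketch (editorial summary; we assume Abandon() is also legal
+  // from kBuffered):
+  //
+  //   kBuffered --EnterUnbuffered()--> kUnbuffered --Finish()/Abandon()--> kClosed
+  //
+  // Builders configured without a compression dictionary
+  // (max_dict_bytes == 0) start directly in kUnbuffered; see the Rep
+  // constructor below.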
+ enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + + const bool use_delta_encoding_for_index_values; + std::unique_ptr<FilterBlockBuilder> filter_builder; + char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t compressed_cache_key_prefix_size; + + BlockHandle pending_handle; // Handle to add to index block + + std::string compressed_output; + std::unique_ptr<FlushBlockPolicy> flush_block_policy; + int level_at_creation; + uint32_t column_family_id; + const std::string& column_family_name; + uint64_t creation_time = 0; + uint64_t oldest_key_time = 0; + const uint64_t target_file_size; + uint64_t file_creation_time = 0; + + std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors; + + Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const BlockBasedTableOptions& table_opt, + const InternalKeyComparator& icomparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t _column_family_id, WritableFileWriter* f, + const CompressionType _compression_type, + const uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, const bool skip_filters, + const int _level_at_creation, const std::string& _column_family_name, + const uint64_t _creation_time, const uint64_t _oldest_key_time, + const uint64_t _target_file_size, const uint64_t _file_creation_time) + : ioptions(_ioptions), + moptions(_moptions), + table_options(table_opt), + internal_comparator(icomparator), + file(f), + alignment(table_options.block_align + ? std::min(table_options.block_size, kDefaultPageSize) + : 0), + data_block(table_options.block_restart_interval, + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(_moptions.prefix_extractor.get()), + compression_type(_compression_type), + sample_for_compression(_sample_for_compression), + compression_opts(_compression_opts), + compression_dict(), + compression_ctx(_compression_type), + verify_dict(), + state((_compression_opts.max_dict_bytes > 0) ? 
State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + compressed_cache_key_prefix_size(0), + flush_block_policy( + table_options.flush_block_policy_factory->NewFlushBlockPolicy( + table_options, data_block)), + level_at_creation(_level_at_creation), + column_family_id(_column_family_id), + column_family_name(_column_family_name), + creation_time(_creation_time), + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size), + file_creation_time(_file_creation_time) { + if (table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( + &internal_comparator, use_delta_encoding_for_index_values, + table_options); + index_builder.reset(p_index_builder_); + } else { + index_builder.reset(IndexBuilder::CreateIndexBuilder( + table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); + } + if (skip_filters) { + filter_builder = nullptr; + } else { + FilterBuildingContext context(table_options); + context.column_family_name = column_family_name; + context.compaction_style = ioptions.compaction_style; + context.level_at_creation = level_at_creation; + context.info_log = ioptions.info_log; + filter_builder.reset(CreateFilterBlockBuilder( + ioptions, moptions, context, use_delta_encoding_for_index_values, + p_index_builder_)); + } + + for (auto& collector_factories : *int_tbl_prop_collector_factories) { + table_properties_collectors.emplace_back( + collector_factories->CreateIntTblPropCollector(column_family_id)); + } + table_properties_collectors.emplace_back( + new BlockBasedTablePropertiesCollector( + table_options.index_type, table_options.whole_key_filtering, + _moptions.prefix_extractor != nullptr)); + if (table_options.verify_compression) { + verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), + compression_type)); + } + } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + ~Rep() {} +}; + +BlockBasedTableBuilder::BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time, const uint64_t oldest_key_time, + const uint64_t target_file_size, const uint64_t file_creation_time) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + ROCKS_LOG_WARN( + ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(ioptions, moptions, sanitized_table_options, + internal_comparator, int_tbl_prop_collector_factories, + column_family_id, file, compression_type, + sample_for_compression, compression_opts, skip_filters, + level_at_creation, column_family_name, 
creation_time, + oldest_key_time, target_file_size, file_creation_time); + + if (rep_->filter_builder != nullptr) { + rep_->filter_builder->StartBlock(0); + } + if (table_options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + table_options.block_cache_compressed.get(), file->writable_file(), + &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size); + } +} + +BlockBasedTableBuilder::~BlockBasedTableBuilder() { + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); + delete rep_; +} + +void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + ValueType value_type = ExtractValueType(key); + if (IsValueType(value_type)) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { + assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); + } +#endif // NDEBUG + + auto should_flush = r->flush_block_policy->Update(key, value); + if (should_flush) { + assert(!r->data_block.empty()); + Flush(); + + if (r->state == Rep::State::kBuffered && + r->data_begin_offset > r->target_file_size) { + EnterUnbuffered(); + } + + // Add item to index block. + // We do not emit the index entry for a block until we have seen the + // first key for the next data block. This allows us to use shorter + // keys in the index block. For example, consider a block boundary + // between the keys "the quick brown fox" and "the who". We can use + // "the r" as the key for the index block entry since it is >= all + // entries in the first block and < all entries in subsequent + // blocks. + if (ok() && r->state == Rep::State::kUnbuffered) { + r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + } + } + + // Note: PartitionedFilterBlockBuilder requires key being added to filter + // builder after being added to index builder. + if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { + size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + + r->last_key.assign(key.data(), key.size()); + r->data_block.Add(key, value); + if (r->state == Rep::State::kBuffered) { + // Buffer keys to be replayed during `Finish()` once compression + // dictionary has been finalized. 
+ if (r->data_block_and_keys_buffers.empty() || should_flush) { + r->data_block_and_keys_buffers.emplace_back(); + } + r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + } else { + r->index_builder->OnKeyAdded(key); + } + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + + } else if (value_type == kTypeRangeDeletion) { + r->range_del_block.Add(key, value); + NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + r->table_properties_collectors, + r->ioptions.info_log); + } else { + assert(false); + } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } +} + +void BlockBasedTableBuilder::Flush() { + Rep* r = rep_; + assert(rep_->state != Rep::State::kClosed); + if (!ok()) return; + if (r->data_block.empty()) return; + WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); +} + +void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, + BlockHandle* handle, + bool is_data_block) { + WriteBlock(block->Finish(), handle, is_data_block); + block->Reset(); +} + +void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, + BlockHandle* handle, + bool is_data_block) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // type: uint8 + // crc: uint32 + assert(ok()); + Rep* r = rep_; + + auto type = r->compression_type; + uint64_t sample_for_compression = r->sample_for_compression; + Slice block_contents; + bool abort_compression = false; + + StopWatchNano timer( + r->ioptions.env, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + + if (r->state == Rep::State::kBuffered) { + assert(is_data_block); + assert(!r->data_block_and_keys_buffers.empty()); + r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + return; + } + + if (raw_block_contents.size() < kCompressionSizeLimit) { + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); + } + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, r->compression_ctx, + *compression_dict, type, + sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + block_contents = CompressBlock( + raw_block_contents, compression_info, &type, + r->table_options.format_version, is_data_block /* do_sample */, + &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, raw_block_contents.size(), + sampled_output_fast.size(), sampled_output_slow.size()); + + // Some of the compression algorithms are known to be unreliable. If + // the verify_compression flag is set then try to de-compress the + // compressed data and compare to the input. 
+ if (type != kNoCompression && r->table_options.verify_compression) { + // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); + BlockContents contents; + UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + r->compression_type); + Status stat = UncompressBlockContentsForCompressionType( + uncompression_info, block_contents.data(), block_contents.size(), + &contents, r->table_options.format_version, r->ioptions); + + if (stat.ok()) { + bool compressed_ok = contents.data.compare(raw_block_contents) == 0; + if (!compressed_ok) { + // The result of the compression was invalid. abort. + abort_compression = true; + ROCKS_LOG_ERROR(r->ioptions.info_log, + "Decompressed block did not match raw block"); + r->status = + Status::Corruption("Decompressed block did not match raw block"); + } + } else { + // Decompression reported an error. abort. + r->status = Status::Corruption("Could not decompress"); + abort_compression = true; + } + } + } else { + // Block is too big to be compressed. + abort_compression = true; + } + + // Abort compression if the block is too big, or did not pass + // verification. + if (abort_compression) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + type = kNoCompression; + block_contents = raw_block_contents; + } else if (type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { + RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + raw_block_contents.size()); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + } else if (type != r->compression_type) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + } + + WriteRawBlock(block_contents, type, handle, is_data_block); + r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, + CompressionType type, + BlockHandle* handle, + bool is_data_block) { + Rep* r = rep_; + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->offset); + handle->set_size(block_contents.size()); + assert(r->status.ok()); + r->status = r->file->Append(block_contents); + if (r->status.ok()) { + char trailer[kBlockTrailerSize]; + trailer[0] = type; + char* trailer_without_type = trailer + 1; + switch (r->table_options.checksum) { + case kNoChecksum: + EncodeFixed32(trailer_without_type, 0); + break; + case kCRC32c: { + auto crc = crc32c::Value(block_contents.data(), block_contents.size()); + crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); + break; + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, block_contents.data(), + static_cast<uint32_t>(block_contents.size())); + XXH32_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(state)); + XXH32_freeState(state); + break; + } + case kxxHash64: { + XXH64_state_t* const state = 
XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, block_contents.data(), + static_cast<uint32_t>(block_contents.size())); + XXH64_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32( + trailer_without_type, + static_cast<uint32_t>(XXH64_digest(state) & // lower 32 bits + uint64_t{0xffffffff})); + XXH64_freeState(state); + break; + } + } + + assert(r->status.ok()); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", + static_cast<char*>(trailer)); + r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); + if (r->status.ok()) { + r->status = InsertBlockInCache(block_contents, type, handle); + } + if (r->status.ok()) { + r->offset += block_contents.size() + kBlockTrailerSize; + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - ((block_contents.size() + kBlockTrailerSize) & + (r->alignment - 1))) & + (r->alignment - 1); + r->status = r->file->Pad(pad_bytes); + if (r->status.ok()) { + r->offset += pad_bytes; + } + } + } + } +} + +Status BlockBasedTableBuilder::status() const { return rep_->status; } + +static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { + BlockContents* bc = reinterpret_cast<BlockContents*>(value); + delete bc; +} + +// +// Make a copy of the block contents and insert into compressed block cache +// +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle) { + Rep* r = rep_; + Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); + + if (type != kNoCompression && block_cache_compressed != nullptr) { + size_t size = block_contents.size(); + + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); + memcpy(ubuf.get(), block_contents.data(), size); + ubuf[size] = type; + + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->is_raw_block = true; +#endif // NDEBUG + + // make cache key by appending the file offset to the cache prefix id + char* end = EncodeVarint64( + r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, + static_cast<size_t>(end - r->compressed_cache_key_prefix)); + + // Insert into compressed block cache. + block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCachedBlockContents); + + // Invalidate OS cache. + r->file->InvalidateCache(static_cast<size_t>(r->offset), size); + } + return Status::OK(); +} + +void BlockBasedTableBuilder::WriteFilterBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle filter_block_handle; + bool empty_filter_block = (rep_->filter_builder == nullptr || + rep_->filter_builder->NumAdded() == 0); + if (ok() && !empty_filter_block) { + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + Slice filter_content = + rep_->filter_builder->Finish(filter_block_handle, &s); + assert(s.ok() || s.IsIncomplete()); + rep_->props.filter_size += filter_content.size(); + WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); + } + } + if (ok() && !empty_filter_block) { + // Add mapping from "<filter_block_prefix>.Name" to location + // of filter data. 
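The `block_align` branch above pads each physical block so that the next one starts on an `r->alignment` boundary; the bitmask expression is only valid because the alignment is a power of two. A worked example of the arithmetic, using 5 as the trailer size implied by the code above (one type byte plus a 32-bit checksum):

```cpp
#include <cstddef>
#include <cstdio>

// Mirrors the padding arithmetic above: align (block + trailer) up to a
// power-of-two boundary. `alignment` must be a power of two for the
// bitmask trick to work.
size_t PadBytes(size_t block_size, size_t trailer_size, size_t alignment) {
  return (alignment - ((block_size + trailer_size) & (alignment - 1))) &
         (alignment - 1);
}

int main() {
  // A 4000-byte block plus a 5-byte trailer occupies 4005 bytes; padding it
  // to a 4096-byte boundary requires 91 more bytes.
  std::printf("%zu\n", PadBytes(4000, 5, 4096));  // prints 91
  // An exactly aligned block needs no padding at all.
  std::printf("%zu\n", PadBytes(4091, 5, 4096));  // prints 0
  return 0;
}
```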
+    std::string key;
+    if (rep_->filter_builder->IsBlockBased()) {
+      key = BlockBasedTable::kFilterBlockPrefix;
+    } else {
+      key = rep_->table_options.partition_filters
+                ? BlockBasedTable::kPartitionedFilterBlockPrefix
+                : BlockBasedTable::kFullFilterBlockPrefix;
+    }
+    key.append(rep_->table_options.filter_policy->Name());
+    meta_index_builder->Add(key, filter_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+  IndexBuilder::IndexBlocks index_blocks;
+  auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+  if (index_builder_status.IsIncomplete()) {
+    // If we have more than one index partition then meta_blocks are not
+    // supported for the index. Currently meta_blocks are used only by
+    // HashIndexBuilder which is not multi-partition.
+    assert(index_blocks.meta_blocks.empty());
+  } else if (ok() && !index_builder_status.ok()) {
+    rep_->status = index_builder_status;
+  }
+  if (ok()) {
+    for (const auto& item : index_blocks.meta_blocks) {
+      BlockHandle block_handle;
+      WriteBlock(item.second, &block_handle, false /* is_data_block */);
+      if (!ok()) {
+        break;
+      }
+      meta_index_builder->Add(item.first, block_handle);
+    }
+  }
+  if (ok()) {
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+    } else {
+      WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+                    index_block_handle);
+    }
+  }
+  // If there are more index partitions, finish them and write them out
+  Status s = index_builder_status;
+  while (ok() && s.IsIncomplete()) {
+    s = rep_->index_builder->Finish(&index_blocks, *index_block_handle);
+    if (!s.ok() && !s.IsIncomplete()) {
+      rep_->status = s;
+      return;
+    }
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
+    } else {
+      WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
+                    index_block_handle);
+    }
+    // The last index_block_handle will be for the partition index block
+  }
+}
+
+void BlockBasedTableBuilder::WritePropertiesBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  BlockHandle properties_block_handle;
+  if (ok()) {
+    PropertyBlockBuilder property_block_builder;
+    rep_->props.column_family_id = rep_->column_family_id;
+    rep_->props.column_family_name = rep_->column_family_name;
+    rep_->props.filter_policy_name =
+        rep_->table_options.filter_policy != nullptr
+            ? rep_->table_options.filter_policy->Name()
+            : "";
+    rep_->props.index_size =
+        rep_->index_builder->IndexSize() + kBlockTrailerSize;
+    rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
+                                      ? rep_->ioptions.user_comparator->Name()
+                                      : "nullptr";
+    rep_->props.merge_operator_name =
+        rep_->ioptions.merge_operator != nullptr
+            ? rep_->ioptions.merge_operator->Name()
+            : "nullptr";
+    rep_->props.compression_name =
+        CompressionTypeToString(rep_->compression_type);
+    rep_->props.compression_options =
+        CompressionOptionsToString(rep_->compression_opts);
+    rep_->props.prefix_extractor_name =
+        rep_->moptions.prefix_extractor != nullptr
+            ? rep_->moptions.prefix_extractor->Name()
+            : "nullptr";
+
+    std::string property_collectors_names = "[";
+    for (size_t i = 0;
+         i < rep_->ioptions.table_properties_collector_factories.size(); ++i) {
+      if (i != 0) {
+        property_collectors_names += ",";
+      }
+      property_collectors_names +=
+          rep_->ioptions.table_properties_collector_factories[i]->Name();
+    }
+    property_collectors_names += "]";
+    rep_->props.property_collectors_names = property_collectors_names;
+    if (rep_->table_options.index_type ==
+        BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      assert(rep_->p_index_builder_ != nullptr);
+      rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
+      rep_->props.top_level_index_size =
+          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+    }
+    rep_->props.index_key_is_user_key =
+        !rep_->index_builder->seperator_is_key_plus_seq();
+    rep_->props.index_value_is_delta_encoded =
+        rep_->use_delta_encoding_for_index_values;
+    rep_->props.creation_time = rep_->creation_time;
+    rep_->props.oldest_key_time = rep_->oldest_key_time;
+    rep_->props.file_creation_time = rep_->file_creation_time;
+
+    // Add basic properties
+    property_block_builder.AddTableProperty(rep_->props);
+
+    // Add user-collected properties
+    NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors,
+                                         rep_->ioptions.info_log,
+                                         &property_block_builder);
+
+    WriteRawBlock(property_block_builder.Finish(), kNoCompression,
+                  &properties_block_handle);
+  }
+  if (ok()) {
+#ifndef NDEBUG
+    {
+      uint64_t props_block_offset = properties_block_handle.offset();
+      uint64_t props_block_size = properties_block_handle.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+          &props_block_offset);
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+          &props_block_size);
+    }
+#endif  // !NDEBUG
+    meta_index_builder->Add(kPropertiesBlock, properties_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteCompressionDictBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (rep_->compression_dict != nullptr &&
+      rep_->compression_dict->GetRawDict().size()) {
+    BlockHandle compression_dict_block_handle;
+    if (ok()) {
+      WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression,
+                    &compression_dict_block_handle);
+#ifndef NDEBUG
+      Slice compression_dict = rep_->compression_dict->GetRawDict();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+          &compression_dict);
+#endif  // NDEBUG
+    }
+    if (ok()) {
+      meta_index_builder->Add(kCompressionDictBlock,
+                              compression_dict_block_handle);
+    }
+  }
+}
+
+void BlockBasedTableBuilder::WriteRangeDelBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (ok() && !rep_->range_del_block.empty()) {
+    BlockHandle range_del_block_handle;
+    WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
+                  &range_del_block_handle);
+    meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
+                                         BlockHandle& index_block_handle) {
+  Rep* r = rep_;
+  // No need to write out new footer if we're using default checksum.
+  // We're writing the legacy magic number because we want old versions of
+  // RocksDB to be able to read files generated with a new release (just in
+  // case somebody wants to roll back after an upgrade).
+  // TODO(icanadi) at some point in the future, when we're absolutely sure
+  // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+  // number and always write new table files with the new magic number.
+  bool legacy = (r->table_options.format_version == 0);
+  // This is guaranteed by BlockBasedTableBuilder's constructor.
+  assert(r->table_options.checksum == kCRC32c ||
+         r->table_options.format_version != 0);
+  Footer footer(
+      legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
+      r->table_options.format_version);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  footer.set_index_handle(index_block_handle);
+  footer.set_checksum(r->table_options.checksum);
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  assert(r->status.ok());
+  r->status = r->file->Append(footer_encoding);
+  if (r->status.ok()) {
+    r->offset += footer_encoding.size();
+  }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kBuffered);
+  r->state = Rep::State::kUnbuffered;
+  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+                                  ? r->compression_opts.zstd_max_train_bytes
+                                  : r->compression_opts.max_dict_bytes;
+  Random64 generator{r->creation_time};
+  std::string compression_dict_samples;
+  std::vector<size_t> compression_dict_sample_lens;
+  if (!r->data_block_and_keys_buffers.empty()) {
+    while (compression_dict_samples.size() < kSampleBytes) {
+      size_t rand_idx = static_cast<size_t>(
+          generator.Uniform(r->data_block_and_keys_buffers.size()));
+      size_t copy_len =
+          std::min(kSampleBytes - compression_dict_samples.size(),
+                   r->data_block_and_keys_buffers[rand_idx].first.size());
+      compression_dict_samples.append(
+          r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
+      compression_dict_sample_lens.emplace_back(copy_len);
+    }
+  }
+
+  // The final data block has been flushed; now we can generate a dictionary
+  // from the samples. It is OK if compression_dict_samples is empty; we'll
+  // just get an empty dictionary.
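Viewed in isolation, the sampling loop above keeps appending prefixes of uniformly chosen buffered blocks (drawn with replacement) until the target byte budget is reached; the collected samples are then fed to the dictionary trainer below. A self-contained sketch of the same policy; `std::mt19937_64` stands in for the builder's `Random64`, the fixed seed is illustrative only, and the caller must pass at least one non-empty block or the loop would never terminate:

```cpp
#include <algorithm>
#include <random>
#include <string>
#include <vector>

// Append prefixes of randomly chosen blocks until `target_bytes` of sample
// data have been gathered. Blocks may be picked more than once; that matches
// the uniform-with-replacement draw used by the builder.
std::string CollectDictionarySamples(const std::vector<std::string>& blocks,
                                     size_t target_bytes,
                                     std::vector<size_t>* sample_lens) {
  std::string samples;
  std::mt19937_64 rng{42};  // the builder seeds from the file creation time
  std::uniform_int_distribution<size_t> pick(0, blocks.size() - 1);
  while (samples.size() < target_bytes) {
    const std::string& block = blocks[pick(rng)];
    size_t copy_len = std::min(target_bytes - samples.size(), block.size());
    samples.append(block, 0, copy_len);
    sample_lens->push_back(copy_len);
  }
  return samples;
}
```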
+ std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { + const auto& data_block = r->data_block_and_keys_buffers[i].first; + auto& keys = r->data_block_and_keys_buffers[i].second; + assert(!data_block.empty()); + assert(!keys.empty()); + + for (const auto& key : keys) { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); + if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + Slice first_key_in_next_block = + r->data_block_and_keys_buffers[i + 1].second.front(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, + r->pending_handle); + } + } + r->data_block_and_keys_buffers.clear(); +} + +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + assert(r->state != Rep::State::kClosed); + bool empty_data_block = r->data_block.empty(); + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } + + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. 
Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); + if (ok()) { + // flush the meta index block + WriteRawBlock(meta_index_builder.Finish(), kNoCompression, + &metaindex_block_handle); + } + if (ok()) { + WriteFooter(metaindex_block_handle, index_block_handle); + } + if (r->file != nullptr) { + file_checksum_ = r->file->GetFileChecksum(); + } + r->state = Rep::State::kClosed; + return r->status; +} + +void BlockBasedTableBuilder::Abandon() { + assert(rep_->state != Rep::State::kClosed); + rep_->state = Rep::State::kClosed; +} + +uint64_t BlockBasedTableBuilder::NumEntries() const { + return rep_->props.num_entries; +} + +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } + +bool BlockBasedTableBuilder::NeedCompact() const { + for (const auto& collector : rep_->table_properties_collectors) { + if (collector->NeedCompact()) { + return true; + } + } + return false; +} + +TableProperties BlockBasedTableBuilder::GetTableProperties() const { + TableProperties ret = rep_->props; + for (const auto& collector : rep_->table_properties_collectors) { + for (const auto& prop : collector->GetReadableProperties()) { + ret.readable_properties.insert(prop); + } + collector->Finish(&ret.user_collected_properties); + } + return ret; +} + +const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName.c_str(); + } +} + +const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; +const std::string BlockBasedTable::kPartitionedFilterBlockPrefix = + "partitionedfilter."; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_builder.h b/src/rocksdb/table/block_based/block_based_table_builder.h new file mode 100644 index 000000000..97c9bc65a --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_builder.h @@ -0,0 +1,157 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include <stdint.h> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/version_edit.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "table/meta_blocks.h" +#include "table/table_builder.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +struct BlockBasedTableOptions; + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; + +class BlockBasedTableBuilder : public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). + BlockBasedTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + const CompressionType compression_type, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + const uint64_t target_file_size = 0, + const uint64_t file_creation_time = 0); + + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + bool NeedCompact() const override; + + // Get table properties + TableProperties GetTableProperties() const override; + + // Get file checksum + const std::string& GetFileChecksum() const override { return file_checksum_; } + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + private: + bool ok() const { return status().ok(); } + + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. 
+ // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + + // Call block's Finish() method + // and then write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + + // Compress and write block content to the file. + void WriteBlock(const Slice& block_contents, BlockHandle* handle, + bool is_data_block); + // Directly write data to the file. + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, + bool is_data_block = false); + Status InsertBlockInCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + + struct Rep; + class BlockBasedTablePropertiesCollectorFactory; + class BlockBasedTablePropertiesCollector; + Rep* rep_; + + // Advanced operation: flush any buffered key/value pairs to file. + // Can be used to ensure that two adjacent entries never live in + // the same data block. Most clients should not need to use this method. + // REQUIRES: Finish(), Abandon() have not been called + void Flush(); + + // Some compression libraries fail when the raw size is bigger than int. If + // uncompressed size is bigger than kCompressionSizeLimit, don't compress it + const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max(); + + // Store file checksum. If checksum is disabled, its value is "0". + std::string file_checksum_ = kUnknownFileChecksum; +}; + +Slice CompressBlock(const Slice& raw, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc new file mode 100644 index 000000000..70a6f38d5 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.cc @@ -0,0 +1,649 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
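To make the contract of the builder declared above concrete, here is a hedged usage sketch: keys must arrive pre-sorted (and, in real use, be correctly encoded internal keys), `status()` should be polled, and a builder that will never be `Finish()`ed must be `Abandon()`ed before destruction. `WriteSortedEntries` is a hypothetical helper, not part of the codebase, and assumes the declarations above are in scope:

```cpp
#include <string>
#include <utility>
#include <vector>

Status WriteSortedEntries(
    BlockBasedTableBuilder* builder,
    const std::vector<std::pair<std::string, std::string>>& sorted_kvs) {
  for (const auto& kv : sorted_kvs) {
    builder->Add(kv.first, kv.second);  // REQUIRES: ascending key order
    if (!builder->status().ok()) {
      builder->Abandon();  // mandatory if Finish() will never be called
      return builder->status();
    }
  }
  return builder->Finish();  // writes meta blocks, metaindex and footer
}
```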
+
+#include <stdint.h>
+#include <cinttypes>
+
+#include <memory>
+#include <string>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+  MutexLock l(&mutex_);
+  if (num_records_ < kNumTracked) {
+    num_records_++;
+  }
+  records_[next_++] = len;
+  if (next_ == kNumTracked) {
+    next_ = 0;
+  }
+}
+
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+  std::vector<size_t> sorted;
+  {
+    MutexLock l(&mutex_);
+
+    if (num_records_ == 0) {
+      return 0;
+    }
+    sorted.assign(records_, records_ + num_records_);
+  }
+
+  // Of the historic sizes, we find the maximum one that satisfies the
+  // condition that if prefetching all, less than 1/8 will be wasted.
+  std::sort(sorted.begin(), sorted.end());
+
+  // Assuming we have 5 data points, and after sorting it looks like this:
+  //
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // +---+ | | | |
+  // | | | | | |
+  // +---+ | | | | | |
+  // | | | | | | | |
+  // +---+ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // and we use each of the values as a candidate, and estimate how much is
+  // wasted compared to how much is read. For example, when we use the 3rd
+  // record as the candidate, this area is what we read:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ***+ *** *** *** *** **
+  // * | | | | | |
+  // +---+ | | | | | *
+  // * | | | | | | | |
+  // +---+ | | | | | | | *
+  // * | | | | X | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // *** *** ***-*** ***--*** ***--*** +****
+  // which is (size of the record) X (number of records).
+  //
+  // While wasted is this area:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ****---+ | | | |
+  // * * | | | | |
+  // * *-*** *** | | | | |
+  // * * | | | | | | |
+  // *--** *** | | | | | | |
+  // | | | | | X | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // Which can be calculated iteratively.
+  // The difference between the waste when using the 4th and the 3rd record
+  // will be the following area:
+  // +---+
+  // +--+ +-+ ++ +-+ +-+ +---+ | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // +-+ +-+ +-+ ++ +---+ +--+ | | |
+  // | | | | | | |
+  // +---+ ++ | | | | | |
+  // | | | | | | X | | |
+  // +---+ ++ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // which will be the size difference between the 4th and 3rd records,
+  // times 3, which is the number of records before the 4th.
+  // Here we assume that all data within the prefetch range will be useful.
In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + +// TODO(myabandeh): We should return an error instead of silently changing the +// options +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + table_options_.index_block_restart_interval != 1) { + // Currently kHashSearch is incompatible with index_block_restart_interval > 1 + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } +} + +Status BlockBasedTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader, table_reader_options.prefix_extractor, + prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + auto table_builder = new BlockBasedTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_options_, table_builder_options.internal_comparator, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_builder_options.compression_type, + table_builder_options.sample_for_compression, + 
table_builder_options.compression_opts,
+      table_builder_options.skip_filters,
+      table_builder_options.column_family_name, table_builder_options.level,
+      table_builder_options.creation_time,
+      table_builder_options.oldest_key_time,
+      table_builder_options.target_file_size,
+      table_builder_options.file_creation_time);
+
+  return table_builder;
+}
+
+Status BlockBasedTableFactory::SanitizeOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      cf_opts.prefix_extractor == nullptr) {
+    return Status::InvalidArgument(
+        "Hash index is specified for block-based "
+        "table, but prefix_extractor is not given");
+  }
+  if (table_options_.cache_index_and_filter_blocks &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable cache_index_and_filter_blocks, "
+        "but block cache is disabled");
+  }
+  if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable pin_l0_filter_and_index_blocks_in_cache, "
+        "but block cache is disabled");
+  }
+  if (!BlockBasedTableSupportedVersion(table_options_.format_version)) {
+    return Status::InvalidArgument(
+        "Unsupported BlockBasedTable format_version. Please check "
+        "include/rocksdb/table.h for more info");
+  }
+  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
+    return Status::InvalidArgument(
+        "Enable block_align, but compression "
+        "enabled");
+  }
+  if (table_options_.block_align &&
+      (table_options_.block_size & (table_options_.block_size - 1))) {
+    return Status::InvalidArgument(
+        "Block alignment requested but block size is not a power of 2");
+  }
+  if (table_options_.block_size > port::kMaxUint32) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.data_block_index_type ==
+          BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+      table_options_.data_block_hash_table_util_ratio <= 0) {
+    return Status::InvalidArgument(
+        "data_block_hash_table_util_ratio should be greater than 0 when "
+        "data_block_index_type is set to kDataBlockBinaryAndHash");
+  }
+  if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
+    // TODO(myabandeh): support it
+    return Status::InvalidArgument(
+        "max_successive_merges larger than 0 is currently inconsistent with "
+        "unordered_write");
+  }
+  return Status::OK();
+}
+
+std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  flush_block_policy_factory: %s (%p)\n",
+           table_options_.flush_block_policy_factory->Name(),
+           static_cast<void*>(table_options_.flush_block_policy_factory.get()));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cache_index_and_filter_blocks: %d\n",
+           table_options_.cache_index_and_filter_blocks);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  cache_index_and_filter_blocks_with_high_priority: %d\n",
+           table_options_.cache_index_and_filter_blocks_with_high_priority);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  pin_l0_filter_and_index_blocks_in_cache: %d\n",
+           table_options_.pin_l0_filter_and_index_blocks_in_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  pin_top_level_index_and_filter: %d\n",
+           table_options_.pin_top_level_index_and_filter);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " 
index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast<int>(table_options_.index_shortening)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", + table_options_.hash_index_allow_collision); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast<void*>(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + static_cast<void*>(table_options_.block_cache_compressed.get())); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + const char* block_cache_compressed_name = + table_options_.block_cache_compressed->Name(); + if (block_cache_compressed_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_compressed_name); + ret.append(buffer); + } + ret.append(" block_cache_compressed_options:\n"); + ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast<void*>(table_options_.persistent_cache.get())); + ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? 
"nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + return ret; +} + +#ifndef ROCKSDB_LITE +namespace { +bool SerializeSingleBlockBasedTableOption( + std::string* opt_string, const BlockBasedTableOptions& bbt_options, + const std::string& name, const std::string& delimiter) { + auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + return false; + } + auto& opt_info = iter->second; + const char* opt_address = + reinterpret_cast<const char*>(&bbt_options) + opt_info.offset; + std::string value; + bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); + if (result) { + *opt_string = name + "=" + value + delimiter; + } + return result; +} +} // namespace + +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + assert(opt_string); + opt_string->clear(); + for (auto iter = block_based_table_type_info.begin(); + iter != block_based_table_type_info.end(); ++iter) { + if (iter->second.verification == OptionVerificationType::kDeprecated) { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + continue; + } + std::string single_output; + bool result = SerializeSingleBlockBasedTableOption( + &single_output, table_options_, iter->first, delimiter); + assert(result); + if (result) { + opt_string->append(single_output); + } + } + return Status::OK(); +} +#else +Status BlockBasedTableFactory::GetOptionString( + std::string* /*opt_string*/, const std::string& /*delimiter*/) const { + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { + return table_options_; +} + +#ifndef ROCKSDB_LITE +namespace { +std::string ParseBlockBasedTableOption(const std::string& name, + const std::string& org_value, + BlockBasedTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + if (!input_strings_escaped) { + // if the input string is not escaped, it means this function is + // invoked from SetOptions, which takes the old format. + if (name == "block_cache" || name == "block_cache_compressed") { + // cache options can be specified in the following format + // "block_cache={capacity=1M;num_shard_bits=4; + // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" + // To support backward compatibility, the following format + // is also supported. + // "block_cache=1M" + std::shared_ptr<Cache> cache; + // block_cache is specified in format block_cache=<cache_size>. 
+ if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + if (!ParseOptionHelper(reinterpret_cast<char*>(&cache_opts), + OptionType::kLRUCacheOptions, value)) { + return "Invalid cache options"; + } + cache = NewLRUCache(cache_opts); + } + + if (name == "block_cache") { + new_options->block_cache = cache; + } else { + new_options->block_cache_compressed = cache; + } + return ""; + } else if (name == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (value.compare(0, kName.size(), kName) != 0) { + return "Invalid filter policy name"; + } + size_t pos = value.find(':', kName.size()); + if (pos == std::string::npos) { + return "Invalid filter policy config, missing bits_per_key"; + } + double bits_per_key = + ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + new_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + return ""; + } + } + const auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} +} // namespace + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParseBlockBasedTableOption( + o.first, o.second, new_table_options, input_strings_escaped, + ignore_unknown_options); + if (error_message != "") { + const auto iter = block_based_table_type_info.find(o.first); + if (iter == block_based_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". 
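The option-string entry points defined above accept both cache spellings described in the parser's comment: the backward-compatible bare capacity and the struct-style `LRUCacheOptions` list. A hedged usage sketch, assuming the public declaration lives in rocksdb/convenience.h as in upstream RocksDB:

```cpp
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

void ParseCacheOptionsDemo() {
  using namespace ROCKSDB_NAMESPACE;
  BlockBasedTableOptions base;
  BlockBasedTableOptions parsed;
  // Backward-compatible shorthand: just a capacity, plus a bloom filter
  // in the bloomfilter:<bits_per_key>:<use_block_based_builder> format.
  Status s = GetBlockBasedTableOptionsFromString(
      base, "block_cache=1M;filter_policy=bloomfilter:10:false", &parsed);
  assert(s.ok());
  // Struct-style form with full LRUCacheOptions fields.
  s = GetBlockBasedTableOptionsFromString(
      base,
      "block_cache={capacity=1M;num_shard_bits=4;strict_capacity_limit=true;"
      "high_pri_pool_ratio=0.5;}",
      &parsed);
  assert(s.ok());
}
```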
+ *new_table_options = table_options; + return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level) { + if ((base_tf != nullptr) != (file_tf != nullptr) && + sanity_check_level > kSanityLevelNone) { + return Status::Corruption( + "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); + } + if (base_tf == nullptr) { + return Status::OK(); + } + assert(file_tf != nullptr); + + const auto& base_opt = base_tf->table_options(); + const auto& file_opt = file_tf->table_options(); + + for (auto& pair : block_based_table_type_info) { + if (pair.second.verification == OptionVerificationType::kDeprecated) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + continue; + } + if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { + if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt), + reinterpret_cast<const char*>(&file_opt), + pair.second, pair.first, nullptr)) { + return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on BlockBasedTableOptions::", + pair.first); + } + } + } + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_factory.h b/src/rocksdb/table/block_based/block_based_table_factory.h new file mode 100644 index 000000000..7c8633c07 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.h @@ -0,0 +1,195 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <stdint.h> + +#include <memory> +#include <string> + +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class BlockBasedTableBuilder; + +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. 
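`NewBlockBasedTableFactory` above is the entry point most applications reach indirectly through their options. A typical wiring sketch using only the public RocksDB API; the specific option values are illustrative, not recommendations:

```cpp
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Build table options, wrap them in a factory, and hand the factory to the
// Options object that will be used to open the DB.
ROCKSDB_NAMESPACE::Options MakeOptions() {
  ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
  table_options.block_size = 16 * 1024;  // illustrative value
  table_options.filter_policy.reset(
      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
  ROCKSDB_NAMESPACE::Options options;
  options.table_factory.reset(
      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
  return options;
}
```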
+class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. + size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + +class BlockBasedTableFactory : public TableFactory { + public: + explicit BlockBasedTableFactory( + const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); + + ~BlockBasedTableFactory() {} + + const char* Name() const override { return kName.c_str(); } + + Status NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + // Sanitizes the specified DB Options. + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; + + std::string GetPrintableTableOptions() const override; + + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override; + + const BlockBasedTableOptions& table_options() const; + + void* GetOptions() override { return &table_options_; } + + bool IsDeleteRangeSupported() const override { return true; } + + static const std::string kName; + + private: + BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; +}; + +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; +extern const std::string kPropTrue; +extern const std::string kPropFalse; + +#ifndef ROCKSDB_LITE +extern Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level); + +static std::unordered_map<std::string, OptionTypeInfo> + block_based_table_type_info = { + /* currently not supported + std::shared_ptr<Cache> block_cache = nullptr; + std::shared_ptr<Cache> block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), + OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, + false, 0}}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"index_type", + {offsetof(struct BlockBasedTableOptions, index_type), + OptionType::kBlockBasedTableIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + 
OptionVerificationType::kNormal, false, 0}}, + {"index_shortening", + {offsetof(struct BlockBasedTableOptions, index_shortening), + OptionType::kBlockBasedTableIndexShorteningMode, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, false, + 0}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, + 0}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kFilterPolicy, OptionVerificationType::kByName, false, + 0}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.cc b/src/rocksdb/table/block_based/block_based_table_reader.cc new file mode 100644 index 000000000..9b37b431f --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.cc @@ -0,0 +1,4531 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" +#include <algorithm> +#include <array> +#include <limits> +#include <string> +#include <utility> +#include <vector> + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" + +#include "monitoring/perf_context_imp.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/util.h" +#include "util/xxhash.h" + +namespace ROCKSDB_NAMESPACE { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTable::IndexReader IndexReader; + +// Found that 256 KB readahead size provides the best performance, based on +// experiments, for auto readahead. Experiment data is in PR #3282. 
+const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0); + +template <typename TBlocklike> +class BlocklikeTraits; + +template <> +class BlocklikeTraits<BlockContents> { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits<ParsedFullFilterBlock> { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } +}; + +template <> +class BlocklikeTraits<Block> { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template <> +class BlocklikeTraits<UncompressionDict> { + public: + static UncompressionDict* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } +}; + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. 
+template <typename TBlocklike> +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + Status s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + result->reset(BlocklikeTraits<TBlocklike>::Create( + std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics, using_zstd, filter_policy)); + } + + return s; +} + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Delete the entry resided in the cache. +template <class Entry> +void DeleteCachedEntry(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast<Entry*>(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, true /* force_erase */); +} + +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast<Cache*>(arg); + Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); + cache->Release(handle, false /* force_erase */); +} + +// For hash based index, return true if prefix_extractor and +// prefix_extractor_block mismatch, false otherwise. This flag will be used +// as total_order_seek via NewIndexIterator +bool PrefixExtractorChanged(const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
+ // Turn off hash index if prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) != 0) { + return true; + } else { + return false; + } +} + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + +} // namespace + +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ?
index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry<Block> index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<Block>* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} + +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + InternalIteratorBase<IndexValue>* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
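+ // (If CacheDependencies() pinned the partitions, partition_map_ maps each + // partition's offset to its cached block, so second-level reads can be + // served from the pinned entries.)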
+ it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currently it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. + } + + void CacheDependencies(bool pin) override { + // Before reading the partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry<Block> index_block; + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", + s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + // Index partitions are assumed to be consecutive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index.
+ return; + } + handle = biter.value().handle; + uint64_t last_off = handle.offset() + block_size(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry<Block> block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + partition_map_[handle.offset()] = std::move(block); + } + } + } + } + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an instance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified.
+ static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry<Block> index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->internal_prefix_transform.get() != nullptr); + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast<HashIndexReader*>(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); + } + + InternalIteratorBase<IndexValue>* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry<Block> index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator<IndexValue>(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<HashIndexReader*>(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr<BlockPrefixIndex> prefix_index_; +}; + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast<uint32_t>(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if 
(get_context) { + ++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, BlockType block_type, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + + if (cache_handle != nullptr) { + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); + } else { + UpdateCacheMissMetrics(block_type, get_context); + } + + return cache_handle; +} + +// Helper function to setup the cache key's prefix for the Table. 
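+// A complete block cache key is this per-file prefix followed by the +// varint64-encoded block offset; see GetCacheKey() below. A sketch (the +// buffer name is illustrative only): +// +// char buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; +// Slice key = GetCacheKey(rep->cache_key_prefix, +// rep->cache_key_prefix_size, handle, buf);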
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); + } + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), + rep->file->file(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast<size_t>(end - buffer); + } +} + +namespace { +// Return true if the `user_prop_name` property in table_properties has a +// `true` value, or if the property is absent (for backward compatibility). +bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older versions don't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array<char, 200> msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file has a global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array<char, 200> msg_buf; + // This is a v1 external sst file, global_seqno is not supported.
+ snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader opens the table reader with kMaxSequenceNumber as largest_seqno + // to denote that it is unknown. + if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array<char, 200> msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast<unsigned long long>(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array<char, 200> msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u has a global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast<unsigned long long>(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast<size_t>(end - cache_key)); +} + +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + if (!ioptions.allow_mmap_reads) { + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + } else { + // Should not prefetch for mmap mode. + prefetch_buffer.reset(new FilePrefetchBuffer( + nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + } + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5.
[meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with a newer " + "version of RocksDB?"); + } + + // We've successfully read the footer. We are ready to serve requests. + // Better not to mutate rep_ after creation, e.g. the internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, level, + immortal_table); + rep->file = std::move(file); + rep->footer = footer; + rep->hash_index_allow_collision = table_options.hash_index_allow_collision; + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefixes correctly. + if (prefix_extractor != nullptr) { + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(prefix_extractor)); + } + SetupCacheKeyPrefix(rep); + std::unique_ptr<BlockBasedTable> new_table( + new BlockBasedTable(rep, block_cache_tracer)); + + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + + // Meta-blocks are not dictionary compressed. Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + // Read metaindex + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), + internal_comparator, &lookup_context); + if (!s.ok()) { + return s; + } + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, &lookup_context); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read()); + } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes.
+ tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast<size_t>(file_size); + } else { + prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer( + nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer( + nullptr, 0, 0, true /* enable */, true /* track_min_offset */)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast<int>(len), 0); + break; + case kxxHash64: + actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. 
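+ // The re-verification relies on the block trailer layout: the byte at + // offset block_size is the compression type, and the following four bytes + // hold the stored checksum, which is computed over the payload plus the + // type byte (hence the block_size + 1 offsets below).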
+ BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = static_cast<size_t>(props_block_handle.size()); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(), + tmp_buf.get(), block_size + 1, value); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { + bool found_properties_block = true; + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties( + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); + } + std::unique_ptr<TableProperties> props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); + } + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep_->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Cannot find Properties block from file."); + } +#ifndef ROCKSDB_LITE + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. 
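+ // (Where a property is absent below, the reader conservatively assumes the + // older on-disk format: index keys are treated as internal keys and index + // values as full, non-delta-encoded handles.)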
+ if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep_->fragmented_range_dels = + std::make_shared<FragmentedRangeTombstoneList>(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { + Status s; + + // Find filter handle and filter type + if (rep_->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; 
+ break; + } + } + } + + // Find compression dictionary handle + bool found_compression_dict = false; + s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, + &rep_->compression_dict_handle); + if (!s.ok()) { + return s; + } + + BlockBasedTableOptions::IndexType index_type = rep_->index_type; + + const bool use_cache = table_options.cache_index_and_filter_blocks; + + // pin both index and filters, down to all partitions + const bool pin_all = + rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + + // prefetch the first level of index + const bool prefetch_index = + prefetch_all || + (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + // pin the first level of index + const bool pin_index = + pin_all || (table_options.pin_top_level_index_and_filter && + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + + std::unique_ptr<IndexReader> index_reader; + s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + prefetch_index, pin_index, lookup_context, + &index_reader); + if (!s.ok()) { + return s; + } + + rep_->index_reader = std::move(index_reader); + + // The partitions of a partitioned index are always stored in the cache. They + // hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_all) { + rep_->index_reader->CacheDependencies(pin_all); + } + + // prefetch the first level of filter + const bool prefetch_filter = + prefetch_all || + (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); + // Partition filters cannot be enabled without partition indexes + assert(!prefetch_filter || prefetch_index); + // pin the first level of filter + const bool pin_filter = + pin_all || (table_options.pin_top_level_index_and_filter && + rep_->filter_type == Rep::FilterType::kPartitionedFilter); + + if (rep_->filter_policy) { + auto filter = new_table->CreateFilterBlockReader( + prefetch_buffer, use_cache, prefetch_filter, pin_filter, + lookup_context); + if (filter) { + // Refer to the comment above about partitioned indexes always being cached + if (prefetch_all) { + filter->CacheDependencies(pin_all); + } + + rep_->filter = std::move(filter); + } + } + + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr<UncompressionDictReader> uncompression_dict_reader; + s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, + prefetch_all, pin_all, lookup_context, + &uncompression_dict_reader); + if (!s.ok()) { + return s; + } + + rep_->uncompression_dict_reader = std::move(uncompression_dict_reader); + } + + assert(s.ok()); + return s; +} + +void BlockBasedTable::SetupForCompaction() { + switch (rep_->ioptions.access_hint_on_compaction_start) { + case Options::NONE: + break; + case Options::NORMAL: + rep_->file->file()->Hint(FSRandomAccessFile::kNormal); + break; + case Options::SEQUENTIAL: + rep_->file->file()->Hint(FSRandomAccessFile::kSequential); + break; + case Options::WILLNEED: + rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed); + break; + default: + assert(false); + } +} + +std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties() + const { + return rep_->table_properties; +} + +size_t BlockBasedTable::ApproximateMemoryUsage() const { + size_t usage = 0; + if (rep_->filter) { + usage += rep_->filter->ApproximateMemoryUsage(); + } + if (rep_->index_reader) { + usage +=
rep_->index_reader->ApproximateMemoryUsage(); + } + if (rep_->uncompression_dict_reader) { + usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage(); + } + return usage; +} + +// Load the meta-index block from the file. On success, return the loaded +// metaindex block and its iterator. +Status BlockBasedTable::ReadMetaIndexBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr<Block>* metaindex_block, + std::unique_ptr<InternalIterator>* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr<Block> metaindex; + Status s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep_->table_options), false /* for_compaction */, + rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Encountered error while reading data from metaindex" + " block %s", + s.ToString().c_str()); + return s; + } + + *metaindex_block = std::move(metaindex); + // meta block uses bytewise comparator. + iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); + return Status::OK(); +} + +template <typename TBlocklike> +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry<TBlocklike>* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + assert(block); + assert(block->IsEmpty()); + + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)), + block_cache, cache_handle); + return s; + } + } + + // If not found, search from the compressed block cache.
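+ // (The promotion into the uncompressed cache below happens only when + // read_options.fill_cache is set and the uncompressed block owns its + // bytes; otherwise the caller just gets an owned copy.)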
+ assert(block->IsEmpty()); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + + // if we find it in the compressed cache, then uncompress it and insert it + // into the uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast<BlockContents*>( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + + // Insert uncompressed block into block cache + if (s.ok()) { + std::unique_ptr<TBlocklike> block_holder( + BlocklikeTraits<TBlocklike>::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && + read_options.fill_cache) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry<TBlocklike>, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + block->SetOwnedValue(block_holder.release()); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +template <typename TBlocklike> +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ?
Cache::Priority::HIGH + : Cache::Priority::LOW; + assert(cached_block); + assert(cached_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.statistics; + + std::unique_ptr<TBlocklike> block_holder; + if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } else { + block_holder.reset(BlocklikeTraits<TBlocklike>::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } + + // Insert the compressed block into the compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because it could point to + // an object on the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry<BlockContents>); + if (s.ok()) { + // The cache now owns block_cont_for_comp_cache; only the failure path + // below may delete it.
+ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + delete block_cont_for_comp_cache; + } + } + + // Insert into the uncompressed block cache + if (block_cache != nullptr && block_holder->own_bytes()) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry<TBlocklike>, &cache_handle, + priority); + if (s.ok()) { + assert(cache_handle != nullptr); + cached_block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + cached_block->SetOwnedValue(block_holder.release()); + } + + return s; +} + +std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { + auto& rep = rep_; + auto filter_type = rep->filter_type; + if (filter_type == Rep::FilterType::kNoFilter) { + return std::unique_ptr<FilterBlockReader>(); + } + + assert(rep->filter_policy); + + switch (filter_type) { + case Rep::FilterType::kPartitionedFilter: + return PartitionedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kBlockFilter: + return BlockBasedFilterBlockReader::Create( + this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + + case Rep::FilterType::kFullFilter: + return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context); + + default: + // filter_type is either kNoFilter (exited the function at the first if), + // or it must be covered in this switch block + assert(false); + return std::unique_ptr<FilterBlockReader>(); + } +} + +// disable_prefix_seek should be set to true when the prefix_extractor found in +// the SST differs from the one in mutable_cf_options and the index type is +// HashBasedIndex +InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + assert(rep_ != nullptr); + assert(rep_->index_reader != nullptr); + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return rep_->index_reader->NewIterator(read_options, disable_prefix_seek, + input_iter, get_context, + lookup_context); +} + +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, create and return a new iterator. +// If input_iter is not null, update it and return it. +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? 
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry<UncompressionDict> uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry<Block> block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. 
Use a longer prefix (41 bytes) to differentiate + // from the SST cache key (31 bytes), and use a non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +template <> +DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>( + const Rep* rep, Block* block, DataBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewDataIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, block_contents_pinned); +} + +template <> +IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>( + const Rep* rep, Block* block, IndexBlockIter* input_iter, + bool block_contents_pinned) { + return block->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + input_iter, rep->ioptions.statistics, /* total_order_seek */ true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full, block_contents_pinned); +} + +// Convert an uncompressed data block (i.e., CachableEntry<Block>) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, create and return a new iterator. +// If input_iter is not null, update it and return it. +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template <typename TBlocklike> +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { + assert(block_entry != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep_->table_options.block_cache.get(); + // No point to cache compressed blocks if it never goes away + Cache* block_cache_compressed = + rep_->immortal_table ? nullptr + : rep_->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. 
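+  // Block cache keys are built from a per-file prefix followed by the block's
+  // offset in the file (see the GetCacheKey() calls below); conceptually:
+  //
+  //   [file-unique prefix][varint64(handle.offset())]
+  //
+  // so blocks of the same file share the prefix and differ only in offset.
+  // The compressed and uncompressed caches use different file prefixes.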
+ Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.statistics; + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; + if (!contents) { + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); + } + + if (s.ok()) { + SequenceNumber seq_no = rep_->get_global_seqno(block_type); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, ckey, block_cache, block_cache_compressed, block_entry, + contents, raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), block_type, get_context); + } + } + } + + // Fill lookup_context. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. 
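+      // For illustration: with the default block_restart_interval of 16 and a
+      // block holding 8 restart points, the estimate below is 16 * 8 = 128
+      // keys. The last restart group may be shorter, so this is only an
+      // approximation.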
+ nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue()); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copies of block_key and cf_name when constructing the + // access record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + lookup_context->referenced_key); + } + } + + assert(s.ok() || block_entry->GetValue() == nullptr); + return s; +} + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is nullptr, +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If it's +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to the heap. In any case, the CachableEntry<Block> returned will +// own the data bytes. +// If compression is enabled and there is no compressed block cache, adjacent +// blocks are read out in one I/O (combined read) +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles.
Some of them may be null handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles, + autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses, + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const { + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = + RetrieveBlock(nullptr, options, handle, uncompression_dict, + &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + } + return; + } + + autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + + uint64_t prev_offset = 0; + size_t prev_len = 0; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block; + autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + size_t prev_end = static_cast<size_t>(prev_offset) + prev_len; + + // If the current block is adjacent to the previous one and, at the same + // time, compression is enabled and there is no compressed cache, we + // combine the two block reads into one.
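+    // For illustration (hypothetical sizes): if block A has
+    // block_size(A) == 4096 (handle size plus the 5-byte trailer) at offset 0
+    // and block B starts at offset 4096, the two are adjacent, so with a
+    // scratch buffer they are folded into a single
+    // FSReadRequest{offset = 0, len = 8192}; B's bytes are later located
+    // inside that request at req_offset 4096.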
+ if (scratch != nullptr && prev_end == handle.offset()) { + req_offset_for_block.emplace_back(prev_len); + prev_len += block_size(handle); + } else { + // No compression, or the current block is not adjacent to the previous + // one: + // Step 1, create a new request for the previous blocks + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.status = IOStatus::OK(); + read_reqs.emplace_back(req); + } + + // Step 2, remember the previous block info + prev_offset = handle.offset(); + prev_len = block_size(handle); + req_offset_for_block.emplace_back(0); + } + req_idx_for_block.emplace_back(read_reqs.size()); + } + // Handle the last block and process the pending last request + if (prev_len != 0) { + FSReadRequest req; + req.offset = prev_offset; + req.len = prev_len; + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + } + req.status = IOStatus::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + idx_in_batch = 0; + size_t valid_batch_idx = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + assert(valid_batch_idx < req_idx_for_block.size()); + assert(valid_batch_idx < req_offset_for_block.size()); + assert(req_idx_for_block[valid_batch_idx] < read_reqs.size()); + size_t& req_idx = req_idx_for_block[valid_batch_idx]; + size_t& req_offset = req_offset_for_block[valid_batch_idx]; + valid_batch_idx++; + FSReadRequest& req = read_reqs[req_idx]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != req.len) { + s = Status::Corruption( + "truncated block read from " + rep_->file->file_name() + + " offset " + ToString(handle.offset()) + ", expected " + + ToString(req.len) + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + size_t cur_read_end = req_offset + block_size(handle); + if (cur_read_end > req.result.size()) { + s = Status::Corruption( + "truncated block read from " + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + ToString(req.len) + + " bytes, got " + ToString(req.result.size())); + } + + bool blocks_share_read_buffer = (req.result.size() != block_size(handle)); + if (s.ok()) { + if (scratch == nullptr && !blocks_share_read_buffer) { + // We allocated a buffer for this block. Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr<char[]> raw_block(req.scratch + req_offset); + raw_block_contents = BlockContents(std::move(raw_block), handle.size()); + } else { + // We used the scratch buffer, which is shared by the blocks. + // raw_block_contents does not have the ownership. + raw_block_contents = + BlockContents(Slice(req.scratch + req_offset, handle.size())); + } + +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = + DecodeFixed32(data + req_offset + handle.size() + 1); + // Since the scratch might be shared, the offset of the data block in + // the buffer might not be 0.
req.result.data() only points to the + // beginning address of each read request, so we need to add the offset + // within the read request. The checksum is stored in the block trailer, + // at offset handle.size() + 1 (after the one-byte compression-type + // marker). + s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(), + req.result.data() + req_offset, + handle.size() + 1, expected); + TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); + } + } + + if (s.ok()) { + // This handles a rare case: compression is enabled and there is no + // compressed cache (so combined read is enabled). In this case, the + // scratch != nullptr. At the same time, some blocks are actually not + // compressed, since their compression space saving is smaller than the + // threshold. In this case, if the block shares the scratch memory, we + // need to copy it to the heap so that it can be added to the regular + // block cache. + CompressionType compression_type = + raw_block_contents.get_compression_type(); + if (scratch != nullptr && compression_type == kNoCompression) { + Slice raw = Slice(req.scratch + req_offset, block_size(handle)); + raw_block_contents = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), + handle.size()); +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + } + } + + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry<Block>* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, block_entry, + BlockType::kData, mget_iter->get_context, + &lookup_data_block_context, &raw_block_contents); + + // block_entry value could be null if no block cache is present, i.e., + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. In that case, fall + // through and set up the block explicitly + if (block_entry->GetValue() != nullptr) { + continue; + } + } + + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data() + req_offset, + handle.size(), &contents, footer.version(), + rep_->ioptions, memory_allocator); + } else { + // There are two cases here: 1) the caller uses the scratch buffer; + // 2) we use the request buffer. If the scratch buffer is used, we + // ensure that all raw blocks are copied to the heap as single blocks. + // If the scratch buffer is not used, we also have no combined read, + // so the raw block can be used directly.
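+        // Either way, raw_block_contents owns its bytes by this point (the
+        // uncompressed, scratch-backed case was copied to the heap above),
+        // so moving it into `contents` transfers real ownership.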
+ contents = std::move(raw_block_contents); + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue( + new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics)); + } + } + (*statuses)[idx_in_batch] = s; + } +} + +template <typename TBlocklike> +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const { + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, + uncompression_dict, block_entry, + block_type, get_context, lookup_context, + /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr<TBlocklike> block; + + { + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0, + GetMemoryAllocator(rep_->table_options), for_compaction, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get()); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + +// Explicitly instantiate templates for all "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file.
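+// A minimal standalone analogue of this pattern (illustrative only, not
+// RocksDB code):
+//
+//   // count.h -- declaration only
+//   template <typename T> int CountSetBits(T v);
+//
+//   // count.cc -- definition plus explicit instantiations
+//   template <typename T> int CountSetBits(T v) {
+//     int n = 0;
+//     while (v) { v &= v - 1; ++n; }  // clear the lowest set bit
+//     return n;
+//   }
+//   template int CountSetBits<uint32_t>(uint32_t);
+//   template int CountSetBits<uint64_t>(uint64_t);
+//
+// Without the explicit instantiations at the bottom of count.cc, callers in
+// other translation units would compile but fail to link; the same reasoning
+// applies to the four instantiations below.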
+template Status BlockBasedTable::RetrieveBlock<BlockContents>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<BlockContents>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<Block>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock<UncompressionDict>( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<UncompressionDict>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase<IndexValue>* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); + } + // Create an empty iterator + return new IndexBlockIter(); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. 
+bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto user_key = ExtractUserKey(internal_key); + if (!prefix_extractor->InDomain(user_key)) { + return true; + } + + bool may_match = true; + Status s; + + // First, try checking with the full filter + FilterBlockReader* const filter = rep_->filter.get(); + bool filter_checked = true; + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check, lookup_context); + } else { + // if the prefix_extractor changed for a block-based filter, skip the + // filter + if (need_upper_bound_check) { + return true; + } + auto prefix = prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + + // To prevent any I/O operation in this method, we set `read_tier` to + // make sure we only read the index or filter when they have already + // been loaded into memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + + // Then, try to find it within each block + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator( + no_io_read_options, + /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, lookup_context)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key()) + : iiter->key()) + .starts_with(ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only one that could potentially contain the prefix.
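+        // For illustration (hypothetical keys): with index keys
+        // ["apple", "banana", "cherry"] and prefix "bb", Seek("bb") lands on
+        // "cherry". "cherry" does not start with "bb", so only the block
+        // whose index entry is "cherry" could still hold "bb"-prefixed keys,
+        // and just that one block's filter is probed below.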
+ BlockHandle handle = iiter->value().handle; + may_match = filter->PrefixMayMatch( + prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); + } + } + } + + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } + } + + return may_match; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) { + SeekImpl(&target); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() { + SeekImpl(nullptr); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl( + const Slice* target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || ((block_type_ == BlockType::kIndex && + !table_->get_rep()->index_key_includes_seq) + ? 
(user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0) + : (icomp_.Compare(*target, key()) <= 0))); + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev( + const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now, prefix seek is totally disabled in auto prefix mode because we + // don't have the logic for it yet + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely contain the position for `target`, the + // same as for Seek(), rather than the block before it. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is different is when they seek to a + // position on the boundary. For example, if they SeekForPrev(5), we should + // go to the first block, rather than the second. However, we don't have the + // information to distinguish the two unless we read the second block. In + // this case, we'll end up reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +template <class TBlockIter, typename TValue> +bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult( + IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + 
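+// Note on the deferred-block read used above: when the index block stores
+// each data block's first key, a seek whose target is at or before that
+// first key can stop at the index entry (is_at_first_key_from_index_) and
+// serve key() from the index alone. The data block is only fetched by
+// MaterializeCurrentBlock() once the caller actually steps into it, which
+// can save an I/O for seek-then-stop access patterns.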
+template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Prefetch additional data for range scans (iterators). Enabled only for + // user reads. + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + if (lookup_context_.caller != TableReaderCaller::kCompaction) { + if (read_options_.readahead_size == 0) { + // Implicit auto readahead + num_file_reads_++; + if (num_file_reads_ > + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast<size_t>(block_size(data_block_handle)) > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = static_cast<size_t>(data_block_handle.offset() + + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxAutoReadaheadSize. + readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, + readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. + rep->CreateFilePrefetchBuffer( + BlockBasedTable::kInitAutoReadaheadSize, + BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + } + } + } else if (!prefetch_buffer_) { + // Explicit user requested readahead + // The actual condition is: + // if (read_options_.readahead_size != 0 && !prefetch_buffer_) + rep->CreateFilePrefetchBuffer(read_options_.readahead_size, + read_options_.readahead_size, + &prefetch_buffer_); + } + } else if (!prefetch_buffer_) { + rep->CreateFilePrefetchBuffer(compaction_readahead_size_, + compaction_readahead_size_, + &prefetch_buffer_); + } + + Status s; + table_->NewDataBlockIterator<TBlockIter>( + read_options_, data_block_handle, &block_iter_, block_type_, + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +template <class TBlockIter, typename TValue> +bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. 
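+    // The index entry claimed this block begins with first_internal_key,
+    // but the block's actual first key differs, so either the index or the
+    // block is corrupt. Report corruption rather than returning wrong data.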
+ block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() { + // TODO: the while loop is inherited from the two-level iterator. We don't + // know whether a block can be empty; if it cannot, the loop could be + // replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than the smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + // TODO(kolmike): Remove the != kBlockCacheTier condition. + if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have checked the lower bound here too, but we opt not to, for + // code simplicity.
+} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, + TValue>::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + read_options.auto_prefix_mode || + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + if (arena == nullptr) { + return new BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>)); + return new (mem) BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; + if (rep_->whole_key_filtering) { + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, get_context, lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + 
rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +void BlockBasedTable::FullFilterKeysMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + MultiGetRange* range, const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return; + } + if (rep_->whole_key_filtering) { + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0) { + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); + } +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be an internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter. + // If the full filter is not useful, then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); + if (!may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if the prefix_extractor found in the SST differs from the options, + // disable BlockPrefixIndex. Only do this check when index_type is + // kHashSearch.
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + bool matched = false; // if such a user key matched a key in the SST + bool done = false; + for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { + IndexValue v = iiter->value(); + + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz), + prefix_extractor, v.handle.offset(), no_io, + /*const_ikey_ptr=*/nullptr, get_context, + &lookup_context); + + if (not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + break; + } + + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between the highest key in the previous + // block and the lowest key in the current block. + break; + } + + BlockCacheLookupContext lookup_data_block_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr}; + bool does_referenced_key_exist = false; + DataBlockIter biter; + uint64_t referenced_data_size = 0; + NewDataBlockIterator<DataBlockIter>( + read_options, v.handle, &biter, BlockType::kData, get_context, + &lookup_data_block_context, + /*s=*/Status(), /*prefetch_buffer*/ nullptr); + + if (no_io && biter.status().IsIncomplete()) { + // couldn't get the block from the block cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + bool may_exist = biter.SeekForGet(key); + // If user-specified timestamp is supported, we cannot end the search + // just because hash index lookup indicates the key+ts does not exist. + if (!may_exist && ts_sz == 0) { + // HashSeek cannot find the key in this block, and the iter is not at + // the end of the block, i.e. the key cannot be in the following + // blocks either. In this case, the seek_key cannot be found, so we + // break from the top level for-loop. + done = true; + } else { + // Call the *saver function on each entry/block until it returns false + for (; biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ? &biter : nullptr)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = biter.key().size() + biter.value().size(); + } + done = true; + break; + } + } + s = biter.status(); + } + // Write the block cache access record.
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok() && !iiter->status().IsNotFound()) { + s = iiter->status(); + } + } + + return s; +} + +using MultiGetRange = MultiGetContext::Range; +void BlockBasedTable::MultiGet(const ReadOptions& read_options, + const MultiGetRange* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters) { + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + MultiGetRange sst_file_range(*mget_range, mget_range->begin(), + mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); + + if (skip_filters || !sst_file_range.empty()) { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
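
The ticker bumps in Get() above can be observed through the statistics object. A small sketch, assuming statistics were installed at open time via options.statistics = rocksdb::CreateDBStatistics():

#include <cstdint>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Read back the bloom filter counters recorded by RecordTick() above.
void LogBloomEffectiveness(const rocksdb::Options& options) {
  if (!options.statistics) {
    return;
  }
  const uint64_t useful =
      options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_USEFUL);
  const uint64_t full_pos =
      options.statistics->getTickerCount(rocksdb::BLOOM_FILTER_FULL_POSITIVE);
  const uint64_t full_true_pos = options.statistics->getTickerCount(
      rocksdb::BLOOM_FILTER_FULL_TRUE_POSITIVE);
  // full_pos - full_true_pos approximates full-filter false positives.
  (void)useful;
  (void)(full_pos - full_true_pos);
}
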
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + sst_file_range.begin()->get_context, &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + uint64_t offset = std::numeric_limits<uint64_t>::max(); + autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles; + autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results; + autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr<char[]> block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + + CachableEntry<UncompressionDict> uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + offset = v.handle.offset(); + BlockHandle handle = v.handle; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock( + nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + miter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + if (s.IsIncomplete()) { + s = Status::OK(); + } + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. 
Add NULL handle to indicate there is
+ // nothing to read from disk
+ block_handles.emplace_back(BlockHandle::NullBlockHandle());
+ } else {
+ block_handles.emplace_back(handle);
+ total_len += block_size(handle);
+ }
+ }
+
+ if (total_len) {
+ char* scratch = nullptr;
+ // If the blocks need to be uncompressed and we don't need the
+ // compressed blocks, then we can use a contiguous block of
+ // memory to read in all the blocks as it will be temporary
+ // storage
+ // 1. If blocks are compressed and compressed block cache is there,
+ // alloc heap bufs
+ // 2. If blocks are uncompressed, alloc heap bufs
+ // 3. If blocks are compressed and no compressed block cache, use
+ // stack buf
+ if (rep_->table_options.block_cache_compressed == nullptr &&
+ rep_->blocks_maybe_compressed) {
+ if (total_len <= kMultiGetReadStackBufSize) {
+ scratch = stack_buf;
+ } else {
+ scratch = new char[total_len];
+ block_buf.reset(scratch);
+ }
+ }
+ RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
+ &statuses, &results, scratch, dict);
+ }
+ }
+
+ DataBlockIter first_biter;
+ DataBlockIter next_biter;
+ size_t idx_in_batch = 0;
+ for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
+ ++miter) {
+ Status s;
+ GetContext* get_context = miter->get_context;
+ const Slice& key = miter->ikey;
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ bool first_block = true;
+ do {
+ DataBlockIter* biter = nullptr;
+ bool reusing_block = true;
+ uint64_t referenced_data_size = 0;
+ bool does_referenced_key_exist = false;
+ BlockCacheLookupContext lookup_data_block_context(
+ TableReaderCaller::kUserMultiGet, tracing_mget_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr);
+ if (first_block) {
+ if (!block_handles[idx_in_batch].IsNull() ||
+ !results[idx_in_batch].IsEmpty()) {
+ first_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, results[idx_in_batch], &first_biter,
+ statuses[idx_in_batch]);
+ reusing_block = false;
+ }
+ biter = &first_biter;
+ idx_in_batch++;
+ } else {
+ IndexValue v = iiter->value();
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ break;
+ }
+
+ next_biter.Invalidate(Status::OK());
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, iiter->value().handle, &next_biter,
+ BlockType::kData, get_context, &lookup_data_block_context,
+ Status(), nullptr);
+ biter = &next_biter;
+ reusing_block = false;
+ }
+
+ if (read_options.read_tier == kBlockCacheTier &&
+ biter->status().IsIncomplete()) {
+ // couldn't get block from block_cache
+ // Update Saver.state to Found because we are only looking for
+ // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter->status().ok()) {
+ s = biter->status();
+ break;
+ }
+
+ bool may_exist = biter->SeekForGet(key);
+ if (!may_exist) {
+ // HashSeek cannot find the key in this block and the iter is not at
+ // the end of the block, i.e. the key cannot be in the following blocks
+ // either. In this case, the seek_key cannot be found, so we break
+ // from the top level for-loop.
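
This batching is the point of MultiGet: keys that resolve to the same data block share one block fetch. A minimal sketch of the public entry point, assuming an open `db`:

#include <string>
#include <vector>
#include "rocksdb/db.h"

// One status per key; values are filled in positionally.
std::vector<rocksdb::Status> BatchedLookup(
    rocksdb::DB* db, const std::vector<rocksdb::Slice>& keys,
    std::vector<std::string>* values) {
  return db->MultiGet(rocksdb::ReadOptions(), keys, values);
}
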
+ break; + } + + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; + } + } + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } + done = true; + break; + } + s = biter->status(); + } + // Write the block cache access. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + *(miter->s) = s; + } + } +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be 
pre-fetched + bool prefetching_boundary_page = false; + + for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + + NewDataBlockIterator<DataBlockIter>( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase<IndexValue>* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase<IndexValue>* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : kMaxAutoReadaheadSize; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. 
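
A hedged sketch of driving this verification from the public API; it assumes a RocksDB release in which DB::VerifyChecksum() accepts ReadOptions, as the table-level method here does:

#include "rocksdb/db.h"

// Verify every table file; a nonzero readahead_size overrides the
// kMaxAutoReadaheadSize default chosen in VerifyChecksumInBlocks() above.
rocksdb::Status VerifyAllTables(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.readahead_size = 2 * 1024 * 1024;  // 2 MiB sequential readahead
  return db->VerifyChecksum(ro);
}
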
+ FilePrefetchBuffer prefetch_buffer(
+ rep_->file.get(), readahead_size /* readahead_size */,
+ readahead_size /* max_readahead_size */,
+ !rep_->ioptions.allow_mmap_reads /* enable */);
+
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle = index_iter->value().handle;
+ BlockContents contents;
+ BlockFetcher block_fetcher(
+ rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
+ &contents, rep_->ioptions, false /* decompress */,
+ false /*maybe_compressed*/, BlockType::kData,
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
+ const Slice& meta_block_name) {
+ if (meta_block_name.starts_with(kFilterBlockPrefix) ||
+ meta_block_name.starts_with(kFullFilterBlockPrefix) ||
+ meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
+ return BlockType::kFilter;
+ }
+
+ if (meta_block_name == kPropertiesBlock) {
+ return BlockType::kProperties;
+ }
+
+ if (meta_block_name == kCompressionDictBlock) {
+ return BlockType::kCompressionDictionary;
+ }
+
+ if (meta_block_name == kRangeDelBlock) {
+ return BlockType::kRangeDeletion;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesBlock) {
+ return BlockType::kHashIndexPrefixes;
+ }
+
+ if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
+ return BlockType::kHashIndexMetadata;
+ }
+
+ assert(false);
+ return BlockType::kInvalid;
+}
+
+Status BlockBasedTable::VerifyChecksumInMetaBlocks(
+ InternalIteratorBase<Slice>* index_iter) {
+ Status s;
+ for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+ s = index_iter->status();
+ if (!s.ok()) {
+ break;
+ }
+ BlockHandle handle;
+ Slice input = index_iter->value();
+ s = handle.DecodeFrom(&input);
+ BlockContents contents;
+ const Slice meta_block_name = index_iter->key();
+ BlockFetcher block_fetcher(
+ rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
+ ReadOptions(), handle, &contents, rep_->ioptions,
+ false /* decompress */, false /*maybe_compressed*/,
+ GetBlockTypeForMetaBlockByName(meta_block_name),
+ UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+ s = block_fetcher.ReadBlockContents();
+ if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
+ TableProperties* table_properties;
+ s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
+ index_iter->value(),
+ &table_properties);
+ delete table_properties;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
+ assert(rep_ != nullptr);
+
+ Cache* const cache = rep_->table_options.block_cache.get();
+ if (cache == nullptr) {
+ return false;
+ }
+
+ char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
+ Slice cache_key =
+ GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
+ cache_key_storage);
+
+ Cache::Handle* const cache_handle = cache->Lookup(cache_key);
+ if (cache_handle == nullptr) {
+ return false;
+ }
+
+ cache->Release(cache_handle);
+
+ return true;
+}
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+ const Slice& key) {
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
+ options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+ /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+
iiter->Seek(key); + assert(iiter->Valid()); + + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr<IndexReader>* index_reader) { + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. + // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true + + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + FALLTHROUGH_INTENDED; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + std::unique_ptr<Block> metaindex_guard; + std::unique_ptr<InternalIterator> metaindex_iter_guard; + auto meta_index_iter = preloaded_meta_index_iter; + bool should_fallback = false; + if (rep_->internal_prefix_transform.get() == nullptr) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "No prefix extractor passed in. Fall back to binary" + " search index."); + should_fallback = true; + } else if (meta_index_iter == nullptr) { + auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, + &metaindex_iter_guard); + if (!s.ok()) { + // we simply fall back to binary search in case there is any + // problem with prefix hash index loading. + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Unable to read the metaindex block." + " Fall back to binary search index."); + should_fallback = true; + } + meta_index_iter = metaindex_iter_guard.get(); + } + + if (should_fallback) { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } else { + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + } + default: { + std::string error_message = + "Unrecognized index type: " + ToString(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf( + const InternalIteratorBase<IndexValue>& index_iter) const { + uint64_t result = 0; + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + result = handle.offset(); + } else { + // The iterator is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. 
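
For reference, an illustrative sketch of the options that select the branches of the CreateIndexReader() switch above; the partitioned (two-level) case is usually paired with partitioned filters so both structures page in and out of block cache in pieces:

#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

// Route CreateIndexReader() to PartitionIndexReader.
rocksdb::BlockBasedTableOptions MakePartitionedOptions() {
  rocksdb::BlockBasedTableOptions to;
  to.index_type = rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  to.partition_filters = true;
  to.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  to.metadata_block_size = 4096;  // target size of each index partition
  return to;
}
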
+ if (result == 0) { + result = rep_->footer.metaindex_handle().offset(); + } + } + + return result; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + return ApproximateOffsetOf(*index_iter); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + ReadOptions ro; + ro.total_order_seek = true; + auto index_iter = + NewIndexIterator(ro, /*disable_prefix_seek=*/true, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset = ApproximateOffsetOf(*index_iter); + index_iter->Seek(end); + uint64_t end_offset = ApproximateOffsetOf(*index_iter); + + assert(end_offset >= start_offset); + return end_offset - start_offset; +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector<KVPairBlock>* kv_pair_blocks) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status 
BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr<Block> metaindex; + std::unique_ptr<InternalIterator> metaindex_iter; + Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == + ROCKSDB_NAMESPACE::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) { + out_file->Append(" Range deletion block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const ROCKSDB_NAMESPACE::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry<UncompressionDict> uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_file->Append( + "Compression Dictionary:\n" + "--------------------------------------\n"); + out_file->Append(" size (bytes): "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size())); + out_file->Append("\n\n"); + out_file->Append(" HEX "); + out_file->Append(raw_dict.ToString(true).c_str()); + out_file->Append("\n\n"); + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_file->Append( + "Range deletions:\n" + "--------------------------------------\n" + " "); + for (; range_del_iter->Valid(); 
range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + } + out_file->Append("\n"); + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + "--------------------------------------\n"); + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_file->Append(" HEX "); + out_file->Append(user_key.ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); + out_file->Append("\n"); + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_file->Append("Data Block # "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr<InternalIterator> datablock_iter; + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error 
reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + } + out_file->Append("\n"); + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg = + static_cast<double>(datablock_size_sum) / num_datablocks; + out_file->Append("Data Block Summary:\n"); + out_file->Append("--------------------------------------"); + out_file->Append("\n # data blocks: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks)); + out_file->Append("\n min data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min)); + out_file->Append("\n max data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max)); + out_file->Append("\n avg data block size: "); + out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg)); + out_file->Append("\n"); + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(value.ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_based_table_reader.h b/src/rocksdb/table/block_based/block_based_table_reader.h new file mode 100644 index 000000000..28a378988 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_reader.h @@ -0,0 +1,824 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/uncompression_dict_reader.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "table/persistent_cache_helper.h"
+#include "table/table_properties_internal.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class FilterBlockReader;
+class BlockBasedFilterBlockReader;
+class FullFilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class FSRandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct ReadOptions;
+class GetContext;
+
+typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
+
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. As for record retrieval, we'll
+// first locate the block where the target record may reside, then read the
+// block into memory, and finally search for the record within the block. To
+// avoid frequent reads of the same block, we introduced the block cache to
+// keep loaded blocks in memory.
+class BlockBasedTable : public TableReader {
+ public:
+ static const std::string kFilterBlockPrefix;
+ static const std::string kFullFilterBlockPrefix;
+ static const std::string kPartitionedFilterBlockPrefix;
+ // The longest prefix of the cache key used to identify blocks.
+ // For Posix files the unique ID is three varints.
+ static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
+
+ // All the below fields control iterator readahead
+ static const size_t kInitAutoReadaheadSize = 8 * 1024;
+ // Found that 256 KB readahead size provides the best performance, based on
+ // experiments, for auto readahead. Experiment data is in PR #3282.
+ static const size_t kMaxAutoReadaheadSize;
+ static const int kMinNumFileReadsToStartAutoReadahead = 2;
+
+ // Attempt to open the table that is stored in bytes [0..file_size)
+ // of "file", and read the metadata entries necessary to allow
+ // retrieving data from the table.
+ //
+ // If successful, returns ok and sets "*table_reader" to the newly opened
+ // table. The client should delete "*table_reader" when no longer needed.
+ // If there was an error while initializing the table, sets "*table_reader" + // to nullptr and returns a non-ok status. + // + // @param file must remain live while this Table is in use. + // @param prefetch_index_and_filter_in_cache can be used to disable + // prefetching of + // index and filter blocks into block cache at startup + // @param skip_filters Disables loading/accessing the filter block. Overrides + // prefetch_index_and_filter_in_cache, so filter will be skipped if both + // are set. + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, + bool skip_filters = false, int level = -1, + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr); + + bool PrefixMayMatch(const Slice& internal_key, + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const; + + // Returns a new iterator over the table contents. + // The result of NewIterator() is initially invalid (caller must + // call one of the Seek methods on the iterator before using it). + // @param skip_filters Disables loading/accessing the filter block + // compaction_readahead_size: its value will only be used if caller = + // kCompaction. + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; + + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& read_options) override; + + // @param skip_filters Disables loading/accessing the filter block + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + void MultiGet(const ReadOptions& readOptions, + const MultiGetContext::Range* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + // Pre-fetch the disk blocks that correspond to the key range specified by + // (kbegin, kend). The call will return error status in the event of + // IO or iteration error. + Status Prefetch(const Slice* begin, const Slice* end) override; + + // Given a key, return an approximate byte offset in the file where + // the data for that key begins (or would begin if the key were + // present in the file). The returned value is in terms of file + // bytes, and so includes effects like compression of the underlying data. + // E.g., the approximate offset of the last key in the table will + // be close to the file length. + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. 
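
A minimal sketch of the public wrapper that ultimately calls the per-table ApproximateSize() declared below; names are illustrative:

#include <cstdint>
#include "rocksdb/db.h"

// The result is in file bytes, i.e. after compression.
uint64_t ApproxRangeBytes(rocksdb::DB* db, const rocksdb::Slice& start,
                          const rocksdb::Slice& limit) {
  rocksdb::Range r(start, limit);
  uint64_t size = 0;
  db->GetApproximateSizes(&r, 1, &size);
  return size;
}
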
+ uint64_t ApproximateSize(const Slice& start, const Slice& end,
+ TableReaderCaller caller) override;
+
+ bool TEST_BlockInCache(const BlockHandle& handle) const;
+
+ // Returns true if the block for the specified key is in cache.
+ // REQUIRES: key is in this table && block cache enabled
+ bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+ // Set up the table for Compaction. Might change some parameters with
+ // posix_fadvise
+ void SetupForCompaction() override;
+
+ std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+ size_t ApproximateMemoryUsage() const override;
+
+ // convert SST file to a human readable form
+ Status DumpTable(WritableFile* out_file) override;
+
+ Status VerifyChecksum(const ReadOptions& readOptions,
+ TableReaderCaller caller) override;
+
+ ~BlockBasedTable();
+
+ bool TEST_FilterBlockInCache() const;
+ bool TEST_IndexBlockInCache() const;
+
+ // IndexReader is the interface that provides the functionality for index
+ // access.
+ class IndexReader {
+ public:
+ virtual ~IndexReader() = default;
+
+ // Create an iterator for index access. If iter is null, then a new object
+ // is created on the heap, and the callee will have the ownership.
+ // If a non-null iter is passed in, it will be used, and the returned value
+ // is either the same as iter or a new on-heap object that
+ // wraps the passed iter. In the latter case the return value points
+ // to a different object than iter, and the callee has the ownership of the
+ // returned object.
+ virtual InternalIteratorBase<IndexValue>* NewIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) = 0;
+
+ // Report an approximation of how much memory has been used other than
+ // memory that was allocated in block cache.
+ virtual size_t ApproximateMemoryUsage() const = 0;
+ // Cache the dependencies of the index reader (e.g. the partitions
+ // of a partitioned index).
+ virtual void CacheDependencies(bool /* pin */) {}
+ };
+
+ class IndexReaderCommon;
+
+ static Slice GetCacheKey(const char* cache_key_prefix,
+ size_t cache_key_prefix_size,
+ const BlockHandle& handle, char* cache_key);
+
+ // Retrieve all key value pairs from data blocks in the table.
+ // The keys retrieved are internal keys.
+ Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); + + struct Rep; + + Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry<Block>& block, + TBlockIter* input_iter, Status s) const; + + class PartitionedIndexIteratorState; + + template <typename TBlocklike> + friend class FilterBlockReaderCommon; + + friend class PartitionIndexReader; + + friend class UncompressionDictReader; + + protected: + Rep* rep_; + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; + + private: + friend class MockedBlockBasedTable; + static std::atomic<uint64_t> next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + BlockType block_type, + GetContext* get_context) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). + template <typename TBlockIter> + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); + + // If block cache enabled (compressed or uncompressed), looks for the block + // identified by handle in (1) uncompressed cache, (2) compressed cache, and + // then (3) file. If found, inserts into the cache(s) that were searched + // unsuccessfully (e.g., if found in file, will add to both uncompressed and + // compressed caches if they're enabled). + // + // @param block_entry value is set to the uncompressed block if found. If + // in uncompressed block cache, also sets cache_handle to reference that + // block. + template <typename TBlocklike> + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). 
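
GetKVPairsFromDataBlocks() has a rough public-API analogue in SstFileReader, sketched below; the public iterator yields user keys, whereas the method above keeps the full internal keys:

#include <memory>
#include <string>
#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"

// Walk every data block of one SST file in key order.
rocksdb::Status ScanSstFile(const std::string& path) {
  rocksdb::Options options;
  rocksdb::SstFileReader reader(options);
  rocksdb::Status s = reader.Open(path);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<rocksdb::Iterator> it(
      reader.NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // it->key() / it->value() come straight out of the data blocks.
  }
  return it->status();
}
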
+ template <typename TBlocklike>
+ Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
+ const ReadOptions& ro, const BlockHandle& handle,
+ const UncompressionDict& uncompression_dict,
+ CachableEntry<TBlocklike>* block_entry,
+ BlockType block_type, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context,
+ bool for_compaction, bool use_cache) const;
+
+ void RetrieveMultipleBlocks(
+ const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
+ results,
+ char* scratch, const UncompressionDict& uncompression_dict) const;
+
+ // Get the iterator from the index reader.
+ //
+ // If input_iter is not set, return a new Iterator.
+ // If input_iter is set, try to update it and return it as Iterator.
+ // However note that in some cases the returned iterator may be different
+ // from input_iter. In such case the returned iterator should be freed.
+ //
+ // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+ // following conditions are met:
+ // 1. We enabled table_options.cache_index_and_filter_blocks.
+ // 2. index is not present in block cache.
+ // 3. We disallowed any io to be performed, that is, read_options ==
+ // kBlockCacheTier
+ InternalIteratorBase<IndexValue>* NewIndexIterator(
+ const ReadOptions& read_options, bool need_upper_bound_check,
+ IndexBlockIter* input_iter, GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ // Read a block from the block caches (if set): block_cache and
+ // block_cache_compressed.
+ // On success, Status::OK will be returned and @block will be populated with
+ // a pointer to the block as well as its block handle.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status GetDataBlockFromCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
+ const UncompressionDict& uncompression_dict, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Put a raw block (maybe compressed) to the corresponding block caches.
+ // This method will perform decompression against raw_block if needed and then
+ // populate the block caches.
+ // On success, Status::OK will be returned; also @block will be populated with
+ // uncompressed block and its cache handle.
+ //
+ // Allocated memory managed by raw_block_contents will be transferred to
+ // PutDataBlockToCache(). After the call, the object will be invalid.
+ // @param uncompression_dict Data for presetting the compression library's
+ // dictionary.
+ template <typename TBlocklike>
+ Status PutDataBlockToCache(
+ const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+ Cache* block_cache, Cache* block_cache_compressed,
+ CachableEntry<TBlocklike>* cached_block,
+ BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
+ const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
+ MemoryAllocator* memory_allocator, BlockType block_type,
+ GetContext* get_context) const;
+
+ // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+ // after a call to Seek(key), until handle_result returns false.
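
An illustrative sketch of the configuration the Status::Incomplete note above refers to; with these options, index and filter blocks compete for block cache space, so a cold read of them may require I/O:

#include "rocksdb/cache.h"
#include "rocksdb/table.h"

// Put index/filter blocks in the same LRU cache as data blocks.
rocksdb::BlockBasedTableOptions MakeCachedOptions() {
  rocksdb::BlockBasedTableOptions to;
  to.block_cache = rocksdb::NewLRUCache(512 << 20);  // 512 MiB shared cache
  to.cache_index_and_filter_blocks = true;
  to.pin_l0_filter_and_index_blocks_in_cache = true;  // keep hot L0 metadata
  return to;
}
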
+ // May not make such a call if filter policy says that key is not present.
+ friend class TableCache;
+ friend class BlockBasedTableBuilder;
+
+ // Create an index reader based on the index type stored in the table.
+ // Optionally, the user can pass a preloaded meta_index_iter for index types
+ // that need to access extra meta blocks during construction. This parameter
+ // helps avoid re-reading the meta index block if the caller has already
+ // created one.
+ Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* preloaded_meta_index_iter,
+ bool use_cache, bool prefetch, bool pin,
+ BlockCacheLookupContext* lookup_context,
+ std::unique_ptr<IndexReader>* index_reader);
+
+ bool FullFilterKeyMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, const Slice& user_key,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ GetContext* get_context,
+ BlockCacheLookupContext* lookup_context) const;
+
+ void FullFilterKeysMayMatch(const ReadOptions& read_options,
+ FilterBlockReader* filter, MultiGetRange* range,
+ const bool no_io,
+ const SliceTransform* prefix_extractor,
+ BlockCacheLookupContext* lookup_context) const;
+
+ static Status PrefetchTail(
+ RandomAccessFileReader* file, uint64_t file_size,
+ TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
+ const bool preload_all,
+ std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
+ Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
+ std::unique_ptr<Block>* metaindex_block,
+ std::unique_ptr<InternalIterator>* iter);
+ Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
+ const Slice& handle_value,
+ TableProperties** table_properties);
+ Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const SequenceNumber largest_seqno);
+ Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
+ InternalIterator* meta_iter,
+ const InternalKeyComparator& internal_comparator,
+ BlockCacheLookupContext* lookup_context);
+ Status PrefetchIndexAndFilterBlocks(
+ FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
+ BlockBasedTable* new_table, bool prefetch_all,
+ const BlockBasedTableOptions& table_options, const int level,
+ BlockCacheLookupContext* lookup_context);
+
+ static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
+
+ Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
+ Status VerifyChecksumInBlocks(const ReadOptions& read_options,
+ InternalIteratorBase<IndexValue>* index_iter);
+
+ // Create the filter from the filter block.
+ std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
+ FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+ bool pin, BlockCacheLookupContext* lookup_context);
+
+ static void SetupCacheKeyPrefix(Rep* rep);
+
+ // Generate a cache key prefix from the file
+ static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
+ char* buffer, size_t* size);
+ static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
+ size_t* size);
+
+ // Given an index iterator, return its offset in the file.
+ uint64_t ApproximateOffsetOf(
+ const InternalIteratorBase<IndexValue>& index_iter) const;
+
+ // Helper functions for DumpTable()
+ Status DumpIndexBlock(WritableFile* out_file);
+ Status DumpDataBlocks(WritableFile* out_file);
+ void DumpKeyValue(const Slice& key, const Slice& value,
+ WritableFile* out_file);
+
+ // A cumulative data block read in MultiGet smaller than this size will
+ // use a stack buffer
+ static constexpr size_t kMultiGetReadStackBufSize = 8192;
+
+ friend class PartitionedFilterBlockReader;
+ friend class PartitionedFilterBlockTest;
+ friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+};
+
+// Maintaining state of a two-level iteration on a partitioned index structure.
+class BlockBasedTable::PartitionedIndexIteratorState
+ : public TwoLevelIteratorState {
+ public:
+ PartitionedIndexIteratorState(
+ const BlockBasedTable* table,
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
+ InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+ const BlockHandle& index_value) override;
+
+ private:
+ // Don't own table_
+ const BlockBasedTable* table_;
+ std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
+};
+
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
+struct BlockBasedTable::Rep {
+ Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
+ const BlockBasedTableOptions& _table_opt,
+ const InternalKeyComparator& _internal_comparator, bool skip_filters,
+ int _level, const bool _immortal_table)
+ : ioptions(_ioptions),
+ env_options(_env_options),
+ table_options(_table_opt),
+ filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
+ internal_comparator(_internal_comparator),
+ filter_type(FilterType::kNoFilter),
+ index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
+ hash_index_allow_collision(false),
+ whole_key_filtering(_table_opt.whole_key_filtering),
+ prefix_filtering(true),
+ global_seqno(kDisableGlobalSequenceNumber),
+ level(_level),
+ immortal_table(_immortal_table) {}
+
+ const ImmutableCFOptions& ioptions;
+ const EnvOptions& env_options;
+ const BlockBasedTableOptions table_options;
+ const FilterPolicy* const filter_policy;
+ const InternalKeyComparator& internal_comparator;
+ Status status;
+ std::unique_ptr<RandomAccessFileReader> file;
+ char cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t cache_key_prefix_size = 0;
+ char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t persistent_cache_key_prefix_size = 0;
+ char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
+ size_t compressed_cache_key_prefix_size = 0;
+ PersistentCacheOptions persistent_cache_options;
+
+ // Footer contains the fixed table information
+ Footer footer;
+
+ std::unique_ptr<IndexReader> index_reader;
+ std::unique_ptr<FilterBlockReader> filter;
+ std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+
+ enum class FilterType {
+ kNoFilter,
+ kFullFilter,
+ kBlockFilter,
+ kPartitionedFilter,
+ };
+ FilterType filter_type;
+ BlockHandle filter_handle;
+ BlockHandle compression_dict_handle;
+
+ std::shared_ptr<const TableProperties> table_properties;
+ BlockBasedTableOptions::IndexType index_type;
+ bool hash_index_allow_collision;
+ bool whole_key_filtering;
+ bool prefix_filtering;
+ // TODO(kailiu) It is very ugly to use internal key in table, since table
+ // module should not be relying on db module.
However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. + // null if no prefix_extractor is passed in when opening the table reader. + std::unique_ptr<SliceTransform> internal_prefix_transform; + std::shared_ptr<const SliceTransform> table_prefix_extractor; + + std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels; + + // If global_seqno is used, all Keys in this file will have the same + // seqno with value `global_seqno`. + // + // A value of kDisableGlobalSequenceNumber means that this feature is disabled + // and every key have it's own seqno. + SequenceNumber global_seqno; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + bool blocks_definitely_zstd_compressed = false; + + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + + const bool immortal_table; + + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties + ? table_properties->column_family_id + : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context:: + kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; + } + void CreateFilePrefetchBuffer( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr<FilePrefetchBuffer>* fpb) const { + fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, + max_readahead_size, + !ioptions.allow_mmap_reads /* enable */)); + } +}; + +// Iterates over the contents of BlockBasedTable. 
+template <class TBlockIter, typename TValue = Slice> +class BlockBasedTableIterator : public InternalIteratorBase<TValue> { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + BlockBasedTableIterator(const BlockBasedTable* table, + const ReadOptions& read_options, + const InternalKeyComparator& icomp, + InternalIteratorBase<IndexValue>* index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, + BlockType block_type, TableReaderCaller caller, + size_t compaction_readahead_size = 0) + : table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + index_iter_(index_iter), + pinned_iters_mgr_(nullptr), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), + block_type_(block_type), + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} + + ~BlockBasedTableIterator() { delete index_iter_; } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + TValue value() const override { + assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast<BlockBasedTableIterator*>(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. + return TValue(); + } + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + // Whether iterator invalidated for being out of bound. + bool IsOutOfBound() override { return is_out_of_bound_; } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + // Load current block if not loaded. 
+    if (is_at_first_key_from_index_) {
+      const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
+    }
+    // BlockIter::IsValuePinned() is always true. No need to check
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           block_iter_points_to_real_block_;
+  }
+
+  void ResetDataIter() {
+    if (block_iter_points_to_real_block_) {
+      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
+        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
+      }
+      block_iter_.Invalidate(Status::OK());
+      block_iter_points_to_real_block_ = false;
+    }
+  }
+
+  void SavePrevIndexValue() {
+    if (block_iter_points_to_real_block_) {
+      // Reseek. If they end up with the same data block, we shouldn't re-fetch
+      // the same data block.
+      prev_block_offset_ = index_iter_->value().handle.offset();
+    }
+  }
+
+ private:
+  enum class IterDirection {
+    kForward,
+    kBackward,
+  };
+
+  const BlockBasedTable* table_;
+  const ReadOptions read_options_;
+  const InternalKeyComparator& icomp_;
+  UserComparatorWrapper user_comparator_;
+  InternalIteratorBase<IndexValue>* index_iter_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  TBlockIter block_iter_;
+
+  // True if block_iter_ is initialized and points to the same block
+  // as the index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+  // Whether the current data block is fully within the iterate upper bound.
+  bool data_block_within_upper_bound_ = false;
+  // True if we're standing at the first key of a block, and we haven't loaded
+  // that block yet. A call to value() will trigger loading the block.
+  bool is_at_first_key_from_index_ = false;
+  bool check_filter_;
+  // TODO(Zhongyi): pick a better name
+  bool need_upper_bound_check_;
+  const SliceTransform* prefix_extractor_;
+  BlockType block_type_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+  // Readahead size used in compaction; its value is used only if
+  // lookup_context_.caller = kCompaction.
+  size_t compaction_readahead_size_;
+
+  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
+  size_t readahead_limit_ = 0;
+  int64_t num_file_reads_ = 0;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target);
+
+  void InitDataBlock();
+  bool MaterializeCurrentBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+  void CheckOutOfBound();
+
+  // Check if the data block is fully within iterate_upper_bound.
+  //
+  // Note MyRocks may update iterate bounds between seeks. To work around it,
+  // we need to check and update data_block_within_upper_bound_ accordingly.
+  void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // The upper bound check isn't sufficient for the backward direction to
+      // guarantee the same result as total order, so disable the prefix
+      // check.
+      return true;
+    }
+    if (check_filter_ &&
+        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
+                                need_upper_bound_check_, &lookup_context_)) {
+      // TODO remember that the iterator is invalidated because of a prefix
+      // match. This can prevent the upper-level file iterator from falsely
+      // believing that the position is the end of the SST file and moving to
+      // the first key of the next file.
+ ResetDataIter(); + return false; + } + return true; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_builder.cc b/src/rocksdb/table/block_based/block_builder.cc new file mode 100644 index 000000000..6f77ef97c --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// BlockBuilder generates blocks where keys are prefix-compressed: +// +// When we store a key, we drop the prefix shared with the previous +// string. This helps reduce the space requirement significantly. +// Furthermore, once every K keys, we do not apply the prefix +// compression and store the entire key. We call this a "restart +// point". The tail end of the block stores the offsets of all of the +// restart points, and can be used to do a binary search when looking +// for a particular key. Values are stored as-is (without compression) +// immediately following the corresponding key. +// +// An entry for a particular key-value pair has the form: +// shared_bytes: varint32 +// unshared_bytes: varint32 +// value_length: varint32 +// key_delta: char[unshared_bytes] +// value: char[value_length] +// shared_bytes == 0 for restart points. +// +// The trailer of the block has the form: +// restarts: uint32[num_restarts] +// num_restarts: uint32 +// restarts[i] contains the offset within the block of the ith restart point. 
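To make the entry format above concrete, the following standalone sketch (illustrative only, not part of this diff) appends one prefix-compressed entry and the restart trailer; the varint and fixed32 helpers are local stand-ins for what util/coding.h provides, and all function names here are hypothetical.

#include <cstdint>
#include <string>
#include <vector>

// Local stand-in for a LEB128-style varint encoder.
static void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

// Local stand-in for a little-endian fixed32 encoder.
static void PutFixed32(std::string* dst, uint32_t v) {
  for (int i = 0; i < 4; ++i) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

// Append one entry: <shared><non_shared><value_length> varints, then the
// key delta (the bytes of `key` past the shared prefix), then the value.
static void AppendEntry(std::string* block, const std::string& prev_key,
                        const std::string& key, const std::string& value) {
  size_t shared = 0;
  while (shared < prev_key.size() && shared < key.size() &&
         prev_key[shared] == key[shared]) {
    ++shared;
  }
  PutVarint32(block, static_cast<uint32_t>(shared));
  PutVarint32(block, static_cast<uint32_t>(key.size() - shared));
  PutVarint32(block, static_cast<uint32_t>(value.size()));
  block->append(key, shared, std::string::npos);
  block->append(value);
}

// Append the trailer: each restart offset as a fixed32, then the count.
static void AppendTrailer(std::string* block,
                          const std::vector<uint32_t>& restarts) {
  for (uint32_t offset : restarts) {
    PutFixed32(block, offset);
  }
  PutFixed32(block, static_cast<uint32_t>(restarts.size()));
}

At a restart point, shared would be forced to 0 and the current buffer offset pushed onto the restarts vector, which is what makes binary search over restart points possible.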
+
+#include "table/block_based/block_builder.h"
+
+#include <assert.h>
+#include <algorithm>
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/data_block_footer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlockBuilder::BlockBuilder(
+    int block_restart_interval, bool use_delta_encoding,
+    bool use_value_delta_encoding,
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    double data_block_hash_table_util_ratio)
+    : block_restart_interval_(block_restart_interval),
+      use_delta_encoding_(use_delta_encoding),
+      use_value_delta_encoding_(use_value_delta_encoding),
+      restarts_(),
+      counter_(0),
+      finished_(false) {
+  switch (index_type) {
+    case BlockBasedTableOptions::kDataBlockBinarySearch:
+      break;
+    case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+      data_block_hash_index_builder_.Initialize(
+          data_block_hash_table_util_ratio);
+      break;
+    default:
+      assert(0);
+  }
+  assert(block_restart_interval_ >= 1);
+  restarts_.push_back(0);  // First restart point is at offset 0
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+}
+
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.clear();
+  restarts_.push_back(0);  // First restart point is at offset 0
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Reset();
+  }
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
+                                         const Slice& value) const {
+  size_t estimate = CurrentSizeEstimate();
+  // Note: this is an imprecise estimate as it accounts for the whole key size
+  // instead of the non-shared key size.
+  estimate += key.size();
+  // In value delta encoding we estimate the value delta size as half the full
+  // value size, since only the size field of the block handle is encoded.
+  estimate +=
+      !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
+          ? value.size()
+          : value.size() / 2;
+
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t);  // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t);  // varint for shared prefix length.
+  // Note: this is an imprecise estimate as there will be two encoded sizes,
+  // one for the shared key size and one for the non-shared key size.
+  estimate += VarintLength(key.size());  // varint for key length.
+  if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
+    estimate += VarintLength(value.size());  // varint for value length.
+ } + + return estimate; +} + +Slice BlockBuilder::Finish() { + // Append restart array + for (size_t i = 0; i < restarts_.size(); i++) { + PutFixed32(&buffer_, restarts_[i]); + } + + uint32_t num_restarts = static_cast<uint32_t>(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); + finished_ = true; + return Slice(buffer_); +} + +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { + assert(!finished_); + assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); + size_t shared = 0; // number of bytes shared with prev key + if (counter_ >= block_restart_interval_) { + // Restart compression + restarts_.push_back(static_cast<uint32_t>(buffer_.size())); + estimate_ += sizeof(uint32_t); + counter_ = 0; + + if (use_delta_encoding_) { + // Update state + last_key_.assign(key.data(), key.size()); + } + } else if (use_delta_encoding_) { + Slice last_key_piece(last_key_); + // See how much sharing to do with previous string + shared = key.difference_offset(last_key_piece); + + // Update state + // We used to just copy the changed data here, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } + + const size_t non_shared = key.size() - shared; + const size_t curr_size = buffer_.size(); + + if (use_value_delta_encoding_) { + // Add "<shared><non_shared>" to buffer_ + PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared)); + } else { + // Add "<shared><non_shared><value_size>" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared), + static_cast<uint32_t>(non_shared), + static_cast<uint32_t>(value.size())); + } + + // Add string delta to buffer_ followed by value + buffer_.append(key.data() + shared, non_shared); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. + if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } + + counter_++; + estimate_ += buffer_.size() - curr_size; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_builder.h b/src/rocksdb/table/block_based/block_builder.h new file mode 100644 index 000000000..42c996e5b --- /dev/null +++ b/src/rocksdb/table/block_based/block_builder.h @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include <vector> + +#include <stdint.h> +#include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/block_based/data_block_hash_index.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder { + public: + BlockBuilder(const BlockBuilder&) = delete; + void operator=(const BlockBuilder&) = delete; + + explicit BlockBuilder(int block_restart_interval, + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); + + // Reset the contents as if the BlockBuilder was just constructed. + void Reset(); + + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); + + // Finish building the block and return a slice that refers to the + // block contents. The returned slice will remain valid for the + // lifetime of this builder or until Reset() is called. + Slice Finish(); + + // Returns an estimate of the current (uncompressed) size of the block + // we are building. + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } + + // Returns an estimated block size after appending key and value. + size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + + // Return true iff no entries have been added since the last Reset() + bool empty() const { return buffer_.empty(); } + + private: + const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder + const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; + + std::string buffer_; // Destination buffer + std::vector<uint32_t> restarts_; // Restart points + size_t estimate_; + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc new file mode 100644 index 000000000..f9d92c74c --- /dev/null +++ b/src/rocksdb/table/block_based/block_prefix_index.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
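Since block_builder.h above fully declares the public interface, a minimal usage sketch may help; this is illustrative only (not part of this diff), and it relies on the documented requirement that keys are added in sorted order.

#include "table/block_based/block_builder.h"

using ROCKSDB_NAMESPACE::BlockBuilder;
using ROCKSDB_NAMESPACE::Slice;

void BuildOneBlock() {
  BlockBuilder builder(16 /* block_restart_interval */);
  // Keys must be added in increasing order (see Add() above).
  builder.Add(Slice("apple"), Slice("value1"));
  builder.Add(Slice("apricot"), Slice("value2"));  // "ap" prefix is shared
  Slice raw = builder.Finish();  // valid until Reset() or destruction
  (void)raw;  // would normally be compressed and written to the table file
  builder.Reset();  // reuse the builder for the next block
}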
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+  return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+  return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the entry
+// stores the block id directly. If there is more than one block per bucket,
+// because of hash collision or a single prefix spanning multiple blocks,
+// the entry points to an array of block ids. The block array is an array of
+// uint32_t's. The first uint32_t indicates the total number of blocks, followed
+// by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for an empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+  return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+  uint32_t index = block_id ^ kBlockArrayMask;
+  assert(index < kBlockArrayMask);
+  return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+  assert(index < kBlockArrayMask);
+  return index | kBlockArrayMask;
+}
+
+// Temporary storage for prefix information during index building
+struct PrefixRecord {
+  Slice prefix;
+  uint32_t start_block;
+  uint32_t end_block;
+  uint32_t num_blocks;
+  PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+  explicit Builder(const SliceTransform* internal_prefix_extractor)
+      : internal_prefix_extractor_(internal_prefix_extractor) {}
+
+  void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+        arena_.AllocateAligned(sizeof(PrefixRecord)));
+    record->prefix = key_prefix;
+    record->start_block = start_block;
+    record->end_block = start_block + num_blocks - 1;
+    record->num_blocks = num_blocks;
+    prefixes_.push_back(record);
+  }
+
+  BlockPrefixIndex* Finish() {
+    // For now, use roughly a 1:1 prefix-to-bucket ratio.
+    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+    // Collect prefix records that hash to the same bucket into a single
+    // linked list.
+    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+    for (PrefixRecord* current : prefixes_) {
+      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+      // Merge the prefix block span if the first block of this prefix is
+      // connected to the last block of the previous prefix.
+ PrefixRecord* prev = prefixes_per_bucket[bucket]; + if (prev) { + assert(current->start_block >= prev->end_block); + auto distance = current->start_block - prev->end_block; + if (distance <= 1) { + prev->end_block = current->end_block; + prev->num_blocks = prev->end_block - prev->start_block + 1; + num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1); + continue; + } + } + current->next = prev; + prefixes_per_bucket[bucket] = current; + num_blocks_per_bucket[bucket] += current->num_blocks; + } + + // Calculate the block array buffer size + uint32_t total_block_array_entries = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks > 1) { + total_block_array_entries += (num_blocks + 1); + } + } + + // Populate the final prefix block index + uint32_t* block_array_buffer = new uint32_t[total_block_array_entries]; + uint32_t* buckets = new uint32_t[num_buckets]; + uint32_t offset = 0; + for (uint32_t i = 0; i < num_buckets; i++) { + uint32_t num_blocks = num_blocks_per_bucket[i]; + if (num_blocks == 0) { + assert(prefixes_per_bucket[i] == nullptr); + buckets[i] = kNoneBlock; + } else if (num_blocks == 1) { + assert(prefixes_per_bucket[i] != nullptr); + assert(prefixes_per_bucket[i]->next == nullptr); + buckets[i] = prefixes_per_bucket[i]->start_block; + } else { + assert(total_block_array_entries > 0); + assert(prefixes_per_bucket[i] != nullptr); + buckets[i] = EncodeIndex(offset); + block_array_buffer[offset] = num_blocks; + uint32_t* last_block = &block_array_buffer[offset + num_blocks]; + auto current = prefixes_per_bucket[i]; + // populate block ids from largest to smallest + while (current != nullptr) { + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; + last_block--; + } + current = current->next; + } + assert(last_block == &block_array_buffer[offset]); + offset += (num_blocks + 1); + } + } + + assert(offset == total_block_array_entries); + + return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets, + buckets, total_block_array_entries, + block_array_buffer); + } + + private: + const SliceTransform* internal_prefix_extractor_; + + std::vector<PrefixRecord*> prefixes_; + Arena arena_; +}; + +Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, + const Slice& prefixes, const Slice& prefix_meta, + BlockPrefixIndex** prefix_index) { + uint64_t pos = 0; + auto meta_pos = prefix_meta; + Status s; + Builder builder(internal_prefix_extractor); + + while (!meta_pos.empty()) { + uint32_t prefix_size = 0; + uint32_t entry_index = 0; + uint32_t num_blocks = 0; + if (!GetVarint32(&meta_pos, &prefix_size) || + !GetVarint32(&meta_pos, &entry_index) || + !GetVarint32(&meta_pos, &num_blocks)) { + s = Status::Corruption( + "Corrupted prefix meta block: unable to read from it."); + break; + } + if (pos + prefix_size > prefixes.size()) { + s = Status::Corruption( + "Corrupted prefix meta block: size inconsistency."); + break; + } + Slice prefix(prefixes.data() + pos, prefix_size); + builder.Add(prefix, entry_index, num_blocks); + + pos += prefix_size; + } + + if (s.ok() && pos != prefixes.size()) { + s = Status::Corruption("Corrupted prefix meta block"); + } + + if (s.ok()) { + *prefix_index = builder.Finish(); + } + + return s; +} + +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { + Slice prefix = internal_prefix_extractor_->Transform(key); + + uint32_t bucket = PrefixToBucket(prefix, num_buckets_); + 
uint32_t block_id = buckets_[bucket];
+
+  if (IsNone(block_id)) {
+    return 0;
+  } else if (IsBlockId(block_id)) {
+    *blocks = &buckets_[bucket];
+    return 1;
+  } else {
+    uint32_t index = DecodeIndex(block_id);
+    assert(index < num_block_array_buffer_entries_);
+    *blocks = &block_array_buffer_[index + 1];
+    uint32_t num_blocks = block_array_buffer_[index];
+    assert(num_blocks > 1);
+    assert(index + num_blocks < num_block_array_buffer_entries_);
+    return num_blocks;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.h b/src/rocksdb/table/block_based/block_prefix_index.h
new file mode 100644
index 000000000..04121320e
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockPrefixIndex {
+ public:
+  // Maps a key to a list of data blocks that could potentially contain
+  // the key, based on the prefix.
+  // Returns the total number of relevant blocks; 0 means the key does
+  // not exist.
+  uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return sizeof(BlockPrefixIndex) +
+           (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+  }
+
+  // Create the hash index by reading from the metadata blocks.
+  // @params prefixes: a sequence of prefixes.
+  // @params prefix_meta: contains the "metadata" of the prefixes.
+  static Status Create(const SliceTransform* hash_key_extractor,
+                       const Slice& prefixes, const Slice& prefix_meta,
+                       BlockPrefixIndex** prefix_index);
+
+  ~BlockPrefixIndex() {
+    delete[] buckets_;
+    delete[] block_array_buffer_;
+  }
+
+ private:
+  class Builder;
+  friend Builder;
+
+  BlockPrefixIndex(const SliceTransform* internal_prefix_extractor,
+                   uint32_t num_buckets, uint32_t* buckets,
+                   uint32_t num_block_array_buffer_entries,
+                   uint32_t* block_array_buffer)
+      : internal_prefix_extractor_(internal_prefix_extractor),
+        num_buckets_(num_buckets),
+        num_block_array_buffer_entries_(num_block_array_buffer_entries),
+        buckets_(buckets),
+        block_array_buffer_(block_array_buffer) {}
+
+  const SliceTransform* internal_prefix_extractor_;
+  uint32_t num_buckets_;
+  uint32_t num_block_array_buffer_entries_;
+  uint32_t* buckets_;
+  uint32_t* block_array_buffer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_test.cc b/src/rocksdb/table/block_based/block_test.cc
new file mode 100644
index 000000000..efa5b3ae3
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_test.cc
@@ -0,0 +1,627 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
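The bucket-entry encoding documented in block_prefix_index.cc above can be restated as a small standalone sketch (illustrative only, not part of this diff): the high bit flags an index into the block array, and 0x7FFFFFFF marks an empty bucket.

#include <cstdint>

constexpr uint32_t kNoneBlock = 0x7FFFFFFF;       // empty-bucket sentinel
constexpr uint32_t kBlockArrayMask = 0x80000000;  // 'pointer' flag bit

// Decode one bucket entry; returns the number of candidate blocks and
// points *blocks at the first block id (mirrors GetBlocks() above).
uint32_t DecodeBucket(const uint32_t* bucket_entry,
                      const uint32_t* block_array, const uint32_t** blocks) {
  uint32_t entry = *bucket_entry;
  if (entry == kNoneBlock) {
    return 0;  // no prefix hashed to this bucket
  }
  if ((entry & kBlockArrayMask) == 0) {
    *blocks = bucket_entry;  // the entry itself is the single block id
    return 1;
  }
  uint32_t index = entry ^ kBlockArrayMask;  // strip the pointer bit
  *blocks = &block_array[index + 1];         // block ids follow the count
  return block_array[index];                 // first slot stores the count
}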
+//
+
+#include <stdio.h>
+#include <algorithm>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random *rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefixes
+  for (int i = from; i < from + len; i += step) {
+    // generate keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100-byte values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}
+
+class BlockTest : public testing::Test {};
+
+// block test
+TEST_F(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(16);
+  int num_records = 100000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  // read contents of block sequentially
+  int count = 0;
+  InternalIterator *iter =
+      reader.NewDataIterator(options.comparator, options.comparator);
+  for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewDataIterator(options.comparator, options.comparator);
+  for (int i = 0; i < num_records; i++) {
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int /*prefix_group_size*/ = 1) {
+  builder->reset(new BlockBuilder(1 /* restart interval */));
+
+  // Add all of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  BlockContents contents_ref(contents.data);
+  Block reader1(std::move(contents), kDisableGlobalSequenceNumber);
+  Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  std::unique_ptr<InternalIterator> regular_iter(
+      reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    regular_iter->Seek(keys[i]);
+    ASSERT_OK(regular_iter->status());
+    ASSERT_TRUE(regular_iter->Valid());
+
+    Slice v = regular_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For the hash index, if a key with a given prefix is not found, the
+  // iterator will simply be set as invalid; whereas the binary-search-based
+  // iterator will return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST_F(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys that start with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with the same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+// A slow and accurate version of BlockReadAmpBitmap that simply stores
+// all the marked ranges in a set.
+class BlockReadAmpBitmapSlowAndAccurate {
+ public:
+  void Mark(size_t start_offset, size_t end_offset) {
+    assert(end_offset >= start_offset);
+    marked_ranges_.emplace(end_offset, start_offset);
+  }
+
+  void ResetCheckSequence() { iter_valid_ = false; }
+
+  // Return true if any byte in this range was Marked.
+  // This does a linear search from the previous position. When calling
+  // multiple times, `offset` needs to be incremental to get correct results.
+  // Call ResetCheckSequence() to reset it.
+  bool IsPinMarked(size_t offset) {
+    if (iter_valid_) {
+      // Has an existing iterator; try a linear search from
+      // the iterator.
+      for (int i = 0; i < 64; i++) {
+        if (offset < iter_->second) {
+          return false;
+        }
+        if (offset <= iter_->first) {
+          return true;
+        }
+
+        iter_++;
+        if (iter_ == marked_ranges_.end()) {
+          iter_valid_ = false;
+          return false;
+        }
+      }
+    }
+    // Initial call, or we have linear searched too many times.
+    // Do a binary search.
+    iter_ = marked_ranges_.lower_bound(
+        std::make_pair(offset, static_cast<size_t>(0)));
+    if (iter_ == marked_ranges_.end()) {
+      iter_valid_ = false;
+      return false;
+    }
+    iter_valid_ = true;
+    return offset <= iter_->first && offset >= iter_->second;
+  }
+
+ private:
+  std::set<std::pair<size_t, size_t>> marked_ranges_;
+  std::set<std::pair<size_t, size_t>>::iterator iter_;
+  bool iter_valid_ = false;
+};
+
+TEST_F(BlockTest, BlockReadAmpBitmap) {
+  uint32_t pin_offset = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
+        pin_offset = *(static_cast<uint32_t *>(arg));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<size_t> block_sizes = {
+      1,                // 1 byte
+      32,               // 32 bytes
+      61,               // 61 bytes
+      64,               // 64 bytes
+      512,              // 0.5 KB
+      1024,             // 1 KB
+      1024 * 4,         // 4 KB
+      1024 * 10,        // 10 KB
+      1024 * 50,        // 50 KB
+      1024 * 1024 * 4,  // 4 MB
+      777,
+      124653,
+  };
+  const size_t kBytesPerBit = 64;
+
+  Random rnd(301);
+  for (size_t block_size : block_sizes) {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
+    BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
+
+    size_t needed_bits = (block_size / kBytesPerBit);
+    if (block_size % kBytesPerBit != 0) {
+      needed_bits++;
+    }
+
+    ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
+
+    // Generate some random entries
+    std::vector<size_t> random_entry_offsets;
+    for (int i = 0; i < 1000; i++) {
+      random_entry_offsets.push_back(rnd.Next() % block_size);
+    }
+    std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
+    auto it =
+        std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
+    random_entry_offsets.resize(
+        std::distance(random_entry_offsets.begin(), it));
+
+    std::vector<std::pair<size_t, size_t>> random_entries;
+    for (size_t i = 0; i < random_entry_offsets.size(); i++) {
+      size_t entry_start = random_entry_offsets[i];
+      size_t entry_end;
+      if (i + 1 < random_entry_offsets.size()) {
+        entry_end = random_entry_offsets[i + 1] - 1;
+      } else {
+        entry_end = block_size - 1;
+      }
+      random_entries.emplace_back(entry_start, entry_end);
+    }
+
+    for (size_t i = 0; i < random_entries.size(); i++) {
+      read_amp_slow_and_accurate.ResetCheckSequence();
+      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+
+      read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
+                           static_cast<uint32_t>(current_entry.second));
+      read_amp_slow_and_accurate.Mark(current_entry.first,
+                                      current_entry.second);
+
+      size_t total_bits = 0;
+      for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
+        total_bits += read_amp_slow_and_accurate.IsPinMarked(
+            bit_idx * kBytesPerBit + pin_offset);
+      }
+      size_t expected_estimate_useful = total_bits * kBytesPerBit;
+      size_t got_estimate_useful =
+          stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+      ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlockTest, BlockWithReadAmpBitmap) {
+  Random rnd(301);
+  Options options = Options();
+
+
std::vector<std::string> keys; + std::vector<std::string> values; + BlockBuilder builder(16); + int num_records = 10000; + + GenerateRandomKVs(&keys, &values, 0, num_records, 1); + // add a bunch of records to a block + for (int i = 0; i < num_records; i++) { + builder.Add(keys[i], values[i]); + } + + Slice rawblock = builder.Finish(); + const size_t kBytesPerBit = 8; + + // Read the block sequentially using Next() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + // read contents of block sequentially + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + + delete iter; + } + + // Read the block sequentially using Seek() + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + for (int i = 0; i < num_records; i++) { + Slice k(keys[i]); + + // search in block for this key + iter->Seek(k); + iter->value(); + read_bytes += iter->TEST_CurrentEntrySize(); + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + // Error in read amplification will be less than 1% if we are reading + // sequentially + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + EXPECT_LT(error_pct, 1); + } + delete iter; + } + + // Read the block randomly + { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber, + kBytesPerBit, stats.get()); + + size_t read_bytes = 0; + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); + std::unordered_set<int> read_keys; + for (int i = 0; i < num_records; i++) { + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + iter->Seek(k); + iter->value(); + if (read_keys.find(index) == read_keys.end()) { + read_keys.insert(index); + read_bytes += iter->TEST_CurrentEntrySize(); + } + + double semi_acc_read_amp = + static_cast<double>(read_bytes) / rawblock.size(); + double read_amp = static_cast<double>(stats->getTickerCount( + READ_AMP_ESTIMATE_USEFUL_BYTES)) / + stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES); + + double error_pct = fabs(semi_acc_read_amp - read_amp) * 100; + // Error in read 
amplification will be less than 2% if we are reading + // randomly + EXPECT_LT(error_pct, 2); + } + delete iter; + } +} + +TEST_F(BlockTest, ReadAmpBitmapPow2) { + std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); +} + +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface<std::tuple<bool, bool>> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector<std::string> *separators, + std::vector<BlockHandle> *block_handles, + std::vector<std::string> *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set<std::string> keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. 
+ keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector<std::string> separators; + std::vector<BlockHandle> block_handles; + std::vector<std::string> first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? 
first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/block_type.h b/src/rocksdb/table/block_based/block_type.h new file mode 100644 index 000000000..b2a913746 --- /dev/null +++ b/src/rocksdb/table/block_based/block_type.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cstdint> + +namespace ROCKSDB_NAMESPACE { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. + +enum class BlockType : uint8_t { + kData, + kFilter, + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/cachable_entry.h b/src/rocksdb/table/block_based/cachable_entry.h new file mode 100644 index 000000000..598f1ef57 --- /dev/null +++ b/src/rocksdb/table/block_based/cachable_entry.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <cassert> +#include "port/likely.h" +#include "rocksdb/cache.h" +#include "rocksdb/cleanable.h" + +namespace ROCKSDB_NAMESPACE { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. +// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). 
+// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. + +template <class T> +class CachableEntry { +public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value) + , cache_(cache) + , cache_handle_(cache_handle) + , own_value_(own_value) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) + : value_(rhs.value_) + , cache_(rhs.cache_) + , cache_handle_(rhs.cache_handle_) + , own_value_(rhs.own_value_) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { + ReleaseResource(); + } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value; + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(value != nullptr); + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + +private: + void ReleaseResource() { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + 
delete value_; + } + } + + void ResetFields() { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast<Cache*>(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast<T*>(arg1); + } + +private: + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.cc b/src/rocksdb/table/block_based/data_block_footer.cc new file mode 100644 index 000000000..5d5d8ed55 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based/data_block_footer.h" + +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_footer.h b/src/rocksdb/table/block_based/data_block_footer.h new file mode 100644 index 000000000..c1cfd4730 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
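The pack/unpack pair above hinges on one bit trick: the restart count occupies the low 31 bits, and the MSB flags the index type. A self-contained model of the same scheme (not from the tree; names are illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

constexpr int kIndexTypeBitShift = 31;
constexpr uint32_t kNumRestartsMask = (1u << kIndexTypeBitShift) - 1u;

// Pack the index-type flag into the MSB and the restart count below it.
uint32_t Pack(bool binary_and_hash, uint32_t num_restarts) {
  assert(num_restarts <= kNumRestartsMask);
  return num_restarts | (binary_and_hash ? 1u << kIndexTypeBitShift : 0u);
}

void Unpack(uint32_t footer, bool* binary_and_hash, uint32_t* num_restarts) {
  *binary_and_hash = (footer & (1u << kIndexTypeBitShift)) != 0;
  *num_restarts = footer & kNumRestartsMask;
}

int main() {
  bool uses_hash_index = false;
  uint32_t num_restarts = 0;
  Unpack(Pack(true, 42), &uses_hash_index, &num_restarts);
  std::cout << uses_hash_index << " " << num_restarts << "\n";  // prints: 1 42
}
```

Because a footer with a clear MSB still reads correctly as a plain restart count, the packed format stays compatible with legacy blocks, as the hash-index comments later in this diff point out.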
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts);
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.cc b/src/rocksdb/table/block_based/data_block_hash_index.cc
new file mode 100644
index 000000000..222475834
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DataBlockHashIndexBuilder::Add(const Slice& key,
+                                    const size_t restart_index) {
+  assert(Valid());
+  if (restart_index > kMaxRestartSupportedByHashIndex) {
+    valid_ = false;
+    return;
+  }
+
+  uint32_t hash_value = GetSliceHash(key);
+  hash_and_restart_pairs_.emplace_back(hash_value,
+                                       static_cast<uint8_t>(restart_index));
+  estimated_num_buckets_ += bucket_per_key_;
+}
+
+void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
+  assert(Valid());
+  uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
+
+  if (num_buckets == 0) {
+    num_buckets = 1;  // sanity check
+  }
+
+  // The built-in hash cannot distribute strings well across buckets when
+  // num_buckets is a power of two, resulting in a high hash collision rate.
+  // We make num_buckets odd to avoid this issue.
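Before the `num_buckets |= 1` below, it is worth seeing why a power-of-two bucket count is risky: `hash % 2^k` keeps only the low k bits of the hash, while an odd modulus draws on all bits. A standalone illustration (not from the tree):

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hashes that differ only above the low 4 bits...
  for (uint32_t h : {0x10u, 0x20u, 0x30u, 0x40u}) {
    // ...all collide with a power-of-two bucket count,
    std::cout << h % 16 << " ";  // prints: 0 0 0 0
  }
  std::cout << "\n";
  for (uint32_t h : {0x10u, 0x20u, 0x30u, 0x40u}) {
    // ...but spread out once the count is forced odd (16 | 1 == 17).
    std::cout << h % 17 << " ";  // prints: 16 15 14 13
  }
  std::cout << "\n";
}
```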
+ num_buckets |= 1; + + std::vector<uint8_t> buckets(num_buckets, kNoEntry); + // write the restart_index array + for (auto& entry : hash_and_restart_pairs_) { + uint32_t hash_value = entry.first; + uint8_t restart_index = entry.second; + uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets); + if (buckets[buck_idx] == kNoEntry) { + buckets[buck_idx] = restart_index; + } else if (buckets[buck_idx] != restart_index) { + // same bucket cannot store two different restart_index, mark collision + buckets[buck_idx] = kCollision; + } + } + + for (uint8_t restart_index : buckets) { + buffer.append( + const_cast<const char*>(reinterpret_cast<char*>(&restart_index)), + sizeof(restart_index)); + } + + // write NUM_BUCK + PutFixed16(&buffer, num_buckets); + + assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex); +} + +void DataBlockHashIndexBuilder::Reset() { + estimated_num_buckets_ = 0; + valid_ = true; + hash_and_restart_pairs_.clear(); +} + +void DataBlockHashIndex::Initialize(const char* data, uint16_t size, + uint16_t* map_offset) { + assert(size >= sizeof(uint16_t)); // NUM_BUCKETS + num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t)); + assert(num_buckets_ > 0); + assert(size > num_buckets_ * sizeof(uint8_t)); + *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) - + num_buckets_ * sizeof(uint8_t)); +} + +uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset, + const Slice& key) const { + uint32_t hash_value = GetSliceHash(key); + uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_); + const char* bucket_table = data + map_offset; + return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/data_block_hash_index.h b/src/rocksdb/table/block_based/data_block_hash_index.h new file mode 100644 index 000000000..f356395f3 --- /dev/null +++ b/src/rocksdb/table/block_based/data_block_hash_index.h @@ -0,0 +1,136 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +// This is an experimental feature aiming to reduce the CPU utilization of +// point-lookup within a data-block. It is only used in data blocks, and not +// in meta-data blocks or per-table index blocks. +// +// It only used to support BlockBasedTable::Get(). +// +// A serialized hash index is appended to the data-block. The new block data +// format is as follows: +// +// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] +// +// RI: Restart Interval (the same as the default data-block format) +// RI_IDX: Restart Interval index (the same as the default data-block format) +// HASH_IDX: The new data-block hash index feature. +// FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as +// the flag indicating if this hash index is in use. Note that +// given a data block < 32KB, the MSB is never used. So we can +// borrow the MSB as the hash index flag. Therefore, this format is +// compatible with the legacy data-blocks with num_restarts < 32768, +// as the MSB is 0. +// +// The format of the data-block hash index is as follows: +// +// HASH_IDX: [B B B ... B NUM_BUCK] +// +// B: bucket, an array of restart index. Each buckets is uint8_t. 
+// NUM_BUCK: Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flags:
+//   kNoEntry=255,
+//   kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can support is 253.
+//
+// Buckets are initialized to kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in the
+// bucket. If there is already a restart index there, we update the existing
+// restart index to a collision marker (kCollision). If the bucket is already
+// marked as a collision, we do not store the restart index either.
+//
+// During a query, a key is first hashed to a bucket. If the bucket stores
+// nothing (kNoEntry), the key is not in this block. If the bucket is marked
+// as a collision (kCollision), we fall back to searching the restart
+// intervals. Otherwise, the bucket stores the restart index of the key, and
+// we go directly to that restart interval to search for the key.
+//
+// Note that we only support blocks with fewer than 254 restart intervals.
+// If a block has more restart intervals than that, a hash index will not be
+// created for it.
+
+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t addresses, we only support blocks of no more than
+// 64KB.
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
+
+class DataBlockHashIndexBuilder {
+ public:
+  DataBlockHashIndexBuilder()
+      : bucket_per_key_(-1 /*uninitialized marker*/),
+        estimated_num_buckets_(0),
+        valid_(false) {}
+
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check
+    }
+    bucket_per_key_ = 1 / util_ratio;
+    valid_ = true;
+  }
+
+  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
+  void Finish(std::string& buffer);
+  void Reset();
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets =
+        static_cast<uint16_t>(estimated_num_buckets_);
+
+    // Matching the num_buckets computation in
+    // DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }
+
+ private:
+  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
+  double estimated_num_buckets_;
+
+  // Now the only usage for `valid_` is to mark it false when an inserted
+  // restart_index is larger than supported. In that case the HashIndex is
+  // not appended to the block content.
+  bool valid_;
+
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+  DataBlockHashIndex() : num_buckets_(0) {}
+
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+  uint8_t Lookup(const char* data, uint32_t map_offset,
+                 const Slice& key) const;
+
+  inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+  // To make the serialized hash index compact and to save space overhead,
+  // all the data fields persisted in the block are in uint16 format. A
+  // uint16 is large enough to index every offset of a 64KiB block.
+  // In other words, DataBlockHashIndex does not support blocks of size equal
+  // to or greater than 64KiB.
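The kNoEntry/kCollision protocol described above can be captured in a few lines. The following toy model (not the RocksDB implementation; it substitutes std::hash for GetSliceHash) shows both the build and the lookup sides:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

constexpr uint8_t kNoEntryV = 255;    // bucket never written
constexpr uint8_t kCollisionV = 254;  // two different restart indexes hashed here

int main() {
  std::vector<uint8_t> buckets(7, kNoEntryV);  // odd bucket count, as above
  auto bucket_of = [&](const std::string& k) {
    return std::hash<std::string>{}(k) % buckets.size();
  };

  auto add = [&](const std::string& key, uint8_t restart_index) {
    uint8_t& b = buckets[bucket_of(key)];
    if (b == kNoEntryV) {
      b = restart_index;  // first key in this bucket
    } else if (b != restart_index) {
      b = kCollisionV;    // a different restart index: poison the bucket
    }
  };

  add("apple", 0);
  add("banana", 1);
  add("cherry", 2);

  uint8_t r = buckets[bucket_of("banana")];
  if (r == kNoEntryV) {
    std::cout << "not in this block\n";
  } else if (r == kCollisionV) {
    std::cout << "collision: binary-search the restart intervals\n";
  } else {
    std::cout << "scan restart interval " << static_cast<int>(r) << "\n";
  }
}
```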
+  uint16_t num_buckets_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index_test.cc b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
new file mode 100644
index 000000000..8548c8508
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
@@ -0,0 +1,719 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  uint8_t entry = index.Lookup(data, map_offset, key);
+  if (entry == kCollision) {
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    return false;
+  }
+
+  return entry == restart_point;
+}
+
+// Random KV generator similar to the one in block_test
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs. The generated keys will be sorted. You
+// can tune the parameters to generate different kinds of test key/value
+// pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string>* keys, + std::vector<std::string>* values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestSmall) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + for (int j = 0; j < 5; j++) { + for (uint8_t i = 0; i < 2 + j; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 2; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } + builder.Reset(); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTest) { + // bucket_num = 200, #keys = 100. 50% utilization + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("fake content"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestCollision) { + // bucket_num = 2. 
There will be intense hash collisions + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("some other fake content to take up space"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key("key" + std::to_string(i)); + uint8_t restart_point = i; + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } +} + +TEST(DataBlockHashIndex, DataBlockHashTestLarge) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i < 100; i++) { + if (i % 2) { + continue; // leave half of the keys out + } + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + m[key] = restart_point; + } + + size_t estimated_size = builder.EstimateSize(); + + std::string buffer("filling stuff"), buffer2; + size_t original_size = buffer.size(); + estimated_size += original_size; + builder.Finish(buffer); + + ASSERT_EQ(buffer.size(), estimated_size); + + buffer2 = buffer; // test for the correctness of relative offset + + Slice s(buffer2); + DataBlockHashIndex index; + uint16_t map_offset; + index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset); + + // the additional hash map should start at the end of the buffer + ASSERT_EQ(original_size, map_offset); + for (uint8_t i = 0; i < 100; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + if (m.count(key)) { + ASSERT_TRUE(m[key] == restart_point); + ASSERT_TRUE( + SearchForOffset(index, s.data(), map_offset, key, restart_point)); + } else { + // we allow false positve, so don't test the nonexisting keys. + // when false positive happens, the search will continue to the + // restart intervals to see if the key really exist. + } + } +} + +TEST(DataBlockHashIndex, RestartIndexExceedMax) { + DataBlockHashIndexBuilder builder; + builder.Initialize(0.75 /*util_ratio*/); + std::unordered_map<std::string, uint8_t> m; + + for (uint8_t i = 0; i <= 253; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + ASSERT_TRUE(builder.Valid()); + + builder.Reset(); + + for (uint8_t i = 0; i <= 254; i++) { + std::string key = "key" + std::to_string(i); + uint8_t restart_point = i; + builder.Add(key, restart_point); + } + + ASSERT_FALSE(builder.Valid()); + + builder.Reset(); + ASSERT_TRUE(builder.Valid()); +} + +TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) { + Options options = Options(); + + BlockBuilder builder(1 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + // #restarts <= 253. 
HashIndex is valid
+  for (int i = 0; i <= 253; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  // #restarts > 253. HashIndex is not used
+  for (int i = 0; i <= 254; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+  Options options = Options();
+  std::string ukey(10, 'k');
+  InternalKey ikey(ukey, 0, kTypeValue);
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       false /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  {
+    // insert a large value. The block size plus HashIndex is 65536.
+    std::string value(65502, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  {
+    // insert a large value. The block size plus HashIndex would be 65537,
+    // which exceeds the max block size supported by HashIndex (65536).
+    // So when the build finishes, no HashIndex is created for the block.
+    std::string value(65503, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    // the index type has fallen back to binary search when the build
+    // finished.
+ ASSERT_EQ(reader.IndexType(), + BlockBasedTableOptions::kDataBlockBinarySearch); + } +} + +TEST(DataBlockHashIndex, BlockTestSingleKey) { + Options options = Options(); + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + + std::string ukey("gopher"); + std::string value("gold"); + InternalKey ikey(ukey, 10, kTypeValue); + builder.Add(ikey.Encode().ToString(), value /*value*/); + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const InternalKeyComparator icmp(BytewiseComparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + bool may_exist; + // search in block for the key just inserted + { + InternalKey seek_ikey(ukey, 10, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ( + options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0); + ASSERT_EQ(iter->value(), value); + } + + // search in block for the existing ukey, but with higher seqno + { + InternalKey seek_ikey(ukey, 20, kValueTypeForSeek); + + // HashIndex should be able to set the iter correctly + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_TRUE(iter->Valid()); + + // user key should match + ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey), + 0); + + // seek_key seqno number should be greater than that of iter result + ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()), + GetInternalKeySeqno(iter->key())); + + ASSERT_EQ(iter->value(), value); + } + + // Search in block for the existing ukey, but with lower seqno + // in this case, hash can find the only occurrence of the user_key, but + // ParseNextDataKey() will skip it as it does not have a older seqno. + // In this case, GetForSeek() is effective to locate the user_key, and + // iter->Valid() == false indicates that we've reached to the end of + // the block and the caller should continue searching the next block. + { + InternalKey seek_ikey(ukey, 5, kValueTypeForSeek); + may_exist = iter->SeekForGet(seek_ikey.Encode().ToString()); + ASSERT_TRUE(may_exist); + ASSERT_FALSE(iter->Valid()); // should have reached to the end of block + } + + delete iter; +} + +TEST(DataBlockHashIndex, BlockTestLarge) { + Random rnd(1019); + Options options = Options(); + std::vector<std::string> keys; + std::vector<std::string> values; + + BlockBuilder builder(16 /* block_restart_interval */, + true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinaryAndHash); + int num_records = 500; + + GenerateRandomKVs(&keys, &values, 0, num_records); + + // Generate keys. Adding a trailing "1" to indicate existent keys. + // Later will Seeking for keys with a trailing "0" to test seeking + // non-existent keys. 
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+  const InternalKeyComparator icmp(BytewiseComparator());
+
+  // random seek existent keys
+  for (int i = 0; i < num_records; i++) {
+    auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator());
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(values[index], iter->value());
+
+    delete iter;
+  }
+
+  // random seek non-existent user keys
+  // In case A), the user_key cannot be found in the HashIndex. The key may
+  // exist in the next block. So the iter is invalidated to tell the caller
+  // to search the next block. This test case belongs to case A).
+  //
+  // Note that for non-existent keys, there is a possibility of false
+  // positives, i.e. the key is still hashed into some restart interval.
+  // Two additional outcomes are possible:
+  // B) we linearly seek the restart interval and do not find the key; the
+  //    iter stops at the start of the next restart interval. The key does
+  //    not exist anywhere.
+  // C) we linearly seek the restart interval and do not find the key; the
+  //    iter stops at the end of the block, i.e. restarts_. The key may
+  //    exist in the next block.
+ // So these combinations are possible when searching non-existent user_key: + // + // case# may_exist iter->Valid() + // A true false + // B false true + // C true false + + for (int i = 0; i < num_records; i++) { + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + std::string ukey(keys[index] + "0" /* non-existing key marker */); + InternalKey ikey(ukey, 0, kTypeValue); + + // search in block for this key + bool may_exist = iter->SeekForGet(ikey.Encode().ToString()); + if (!may_exist) { + ASSERT_TRUE(iter->Valid()); + } + if (!iter->Valid()) { + ASSERT_TRUE(may_exist); + } + + delete iter; + } +} + +// helper routine for DataBlockHashIndex.BlockBoundary +void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, + std::string& v2, InternalKey& seek_ikey, + GetContext& get_context, Options& options) { + std::unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<RandomAccessFileReader> file_reader; + std::unique_ptr<TableReader> table_reader; + int level_ = -1; + + std::vector<std::string> keys; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + + EnvOptions soptions; + + soptions.use_mmap_reads = ioptions.allow_mmap_reads; + file_writer.reset( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr<TableBuilder> builder; + std::vector<std::unique_ptr<IntTblPropCollectorFactory>> + int_tbl_prop_collector_factories; + std::string column_family_name; + builder.reset(ioptions.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, + options.compression, options.sample_for_compression, + CompressionOptions(), false /* skip_filters */, + column_family_name, level_), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + builder->Add(ik1.Encode().ToString(), v1); + builder->Add(ik2.Encode().ToString(), v2); + EXPECT_TRUE(builder->status().ok()); + + Status s = builder->Finish(); + file_writer->Flush(); + EXPECT_TRUE(s.ok()) << s.ToString(); + + EXPECT_EQ( + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), + builder->FileSize()); + + // Open the table + file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), + 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; + ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader), + test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), + &table_reader); + // Search using Get() + ReadOptions ro; + + ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context, + moptions.prefix_extractor.get())); +} + +TEST(DataBlockHashIndex, BlockBoundary) { + BlockBasedTableOptions table_options; + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + table_options.block_restart_interval = 1; + table_options.block_size = 4096; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // insert two large k/v pair. 
Given that the block_size is 4096, one k/v + // pair will take up one block. + // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, true, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/filter_block.h b/src/rocksdb/table/block_based/filter_block.h new file mode 100644 index 000000000..1ad8d3f18 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block.h @@ -0,0 +1,176 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file. It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+//
+// This is the base class for BlockBasedFilter and FullFilter, both of which
+// are used in BlockBasedTable. The first contains filters for a portion of
+// the keys in an SST file; the second contains a filter for all keys in the
+// SST file.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kNotValid = ULLONG_MAX;
+class FilterPolicy;
+
+class GetContext;
+using MultiGetRange = MultiGetContext::Range;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//   (StartBlock Add*)* Finish
+//
+// BlockBased/Full FilterBlocks are called in the same way.
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder() {}
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&) = delete;
+  void operator=(const FilterBlockBuilder&) = delete;
+
+  virtual ~FilterBlockBuilder() {}
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+  virtual void StartBlock(uint64_t block_offset) = 0;  // Start new block filter
+  virtual void Add(const Slice& key) = 0;  // Add a key to current filter
+  virtual size_t NumAdded() const = 0;  // Number of keys added
+  Slice Finish() {  // Generate Filter
+    const BlockHandle empty_handle;
+    Status dont_care_status;
+    auto ret = Finish(empty_handle, &dont_care_status);
+    assert(dont_care_status.ok());
+    return ret;
+  }
+  virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
+};
+
+// A FilterBlockReader is used to parse a filter from an SST table.
+// KeyMayMatch and PrefixMayMatch trigger the filter checking.
+//
+// BlockBased/Full FilterBlocks are called in the same way.
+class FilterBlockReader {
+ public:
+  FilterBlockReader() = default;
+  virtual ~FilterBlockReader() = default;
+
+  FilterBlockReader(const FilterBlockReader&) = delete;
+  FilterBlockReader& operator=(const FilterBlockReader&) = delete;
+
+  virtual bool IsBlockBased() = 0;  // Whether this is a block-based filter
+
+  /**
+   * If no_io is set, then it returns true if it cannot answer the query
+   * without reading data from disk. This is used in
+   * PartitionedFilterBlockReader to avoid reading partitions that are not
+   * already in the block cache.
+   *
+   * Normally filters are built on only the user keys and the InternalKey is
+   * not needed for a query. The index in PartitionedFilterBlockReader however
+   * is built upon InternalKey and must be provided via const_ikey_ptr when
+   * running queries.
+ */ + virtual bool KeyMayMatch(const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + + virtual void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey = iter->ukey; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { + range->SkipKey(iter); + } + } + } + + /** + * no_io and const_ikey_ptr here means the same as in KeyMayMatch + */ + virtual bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + + virtual void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + for (auto iter = range->begin(); iter != range->end(); ++iter) { + const Slice ukey = iter->ukey; + const Slice ikey = iter->ikey; + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey) && + !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, + block_offset, no_io, &ikey, get_context, + lookup_context)) { + range->SkipKey(iter); + } + } + } + + virtual size_t ApproximateMemoryUsage() const = 0; + + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + + virtual void CacheDependencies(bool /*pin*/) {} + + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) { + if (need_upper_bound_check) { + return true; + } + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.cc b/src/rocksdb/table/block_based/filter_block_reader_common.cc new file mode 100644 index 000000000..fa0802669 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + return s; +} + +template <typename TBlocklike> +const SliceTransform* +FilterBlockReaderCommon<TBlocklike>::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template <typename TBlocklike> +bool FilterBlockReaderCommon<TBlocklike>::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template <typename TBlocklike> +Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + cache_filter_blocks(), get_context, lookup_context, + filter_block); +} + +template <typename TBlocklike> +size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon<BlockContents>; +template class FilterBlockReaderCommon<Block>; +template class FilterBlockReaderCommon<ParsedFullFilterBlock>; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.h b/src/rocksdb/table/block_based/filter_block_reader_common.h new file mode 100644 index 000000000..a18bc5449 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_block_reader_common.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include <cassert> +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +template <typename TBlocklike> +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry<TBlocklike>&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<TBlocklike>* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry<TBlocklike> filter_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy.cc b/src/rocksdb/table/block_based/filter_policy.cc new file mode 100644 index 000000000..c8f23ee33 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy.cc @@ -0,0 +1,759 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
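The `template class FilterBlockReaderCommon<...>;` lines above rely on explicit template instantiation, which is what lets the member definitions live in the .cc file rather than the header. A minimal standalone sketch of the pattern (hypothetical `Holder` type and file names, not from the tree):

```cpp
// holder.h: declarations only.
template <typename T>
class Holder {
 public:
  explicit Holder(T v);
  T get() const;

 private:
  T v_;
};

// holder.cc: out-of-line definitions, plus explicit instantiations for the
// types the rest of the program is allowed to use.
template <typename T>
Holder<T>::Holder(T v) : v_(v) {}

template <typename T>
T Holder<T>::get() const {
  return v_;
}

// Without these two lines, any user of holder.h would hit link errors,
// because no instantiation of the members would exist in any object file.
template class Holder<int>;
template class Holder<double>;
```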
+ +#include <array> +#include <deque> + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "third-party/folly/folly/ConstexprMath.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit FastLocalBloomBitsBuilder(const int millibits_per_key) + : millibits_per_key_(millibits_per_key), + num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + if (hash_entries_.empty() || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + virtual Slice Finish(std::unique_ptr<const char[]>* buf) override { + uint32_t len_with_metadata = + CalculateSpace(static_cast<uint32_t>(hash_entries_.size())); + char* data = new char[len_with_metadata]; + memset(data, 0, len_with_metadata); + + assert(data); + assert(len_with_metadata >= 5); + + uint32_t len = len_with_metadata - 5; + if (len > 0) { + AddAllEntries(data, len); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + data[len] = static_cast<char>(-1); + // 0 = Marker for this sub-implementation + data[len + 1] = static_cast<char>(0); + // num_probes (and 0 in upper bits for 64-byte block size) + data[len + 2] = static_cast<char>(num_probes_); + // rest of metadata stays zero + + const char* const_data = data; + buf->reset(const_data); + assert(hash_entries_.empty()); + + return Slice(data, len_with_metadata); + } + + int CalculateNumEntry(const uint32_t bytes) override { + uint32_t bytes_no_meta = bytes >= 5u ? 
bytes - 5u : 0; + return static_cast<int>(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t num_cache_lines = 0; + if (millibits_per_key_ > 0 && num_entry > 0) { + num_cache_lines = static_cast<uint32_t>( + (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + } + return num_cache_lines * 64 + /*metadata*/ 5; + } + + double EstimatedFpRate(size_t keys, size_t bytes) override { + return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + num_probes_, /*hash bits*/ 64); + } + + private: + void AddAllEntries(char* data, uint32_t len) { + // Simple version without prefetching: + // + // for (auto h : hash_entries_) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes_, data); + // } + + const size_t num_entries = hash_entries_.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array<uint32_t, kBufferMask + 1> hashes; + std::array<uint32_t, kBufferMask + 1> byte_offsets; + + // Prime the buffer + size_t i = 0; + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = hash_entries_.front(); + hash_entries_.pop_front(); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + data + byte_offset_ref); + // And buffer + uint64_t h = hash_entries_.front(); + hash_entries_.pop_front(); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + data + byte_offsets[i]); + } + } + + int millibits_per_key_; + int num_probes_; + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. 
+ std::deque<uint64_t> hash_entries_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public FilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>; + +class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log); + + // No Copy allowed + LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; + void operator=(const LegacyBloomBitsBuilder&) = delete; + + ~LegacyBloomBitsBuilder() override; + + void AddKey(const Slice& key) override; + + Slice Finish(std::unique_ptr<const char[]>* buf) override; + + int CalculateNumEntry(const uint32_t bytes) override; + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entry, &dont_care1, &dont_care2); + } + + double EstimatedFpRate(size_t keys, size_t bytes) override { + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + num_probes_); + } + + private: + int bits_per_key_; + int num_probes_; + std::vector<uint32_t> hash_entries_; + Logger* info_log_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. 
+ void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key, + Logger* info_log) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)), + info_log_(info_log) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) { + uint32_t total_bits, num_lines; + size_t num_entries = hash_entries_.size(); + char* data = + ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + + // Check for excessive entries for 32-bit hash function + if (num_entries >= /* minimum of 3 million */ 3000000U) { + // More specifically, we can detect that the 32-bit hash function + // is causing significant increase in FP rate by comparing current + // estimated FP rate to what we would get with a normal number of + // keys at same memory ratio. + double est_fp_rate = LegacyBloomImpl::EstimatedFpRate( + num_entries, total_bits / 8, num_probes_); + double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate( + 1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_); + + if (est_fp_rate >= 1.50 * vs_fp_rate) { + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + info_log_, + "Using legacy SST/BBT Bloom filter with excessive key count " + "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. " + "Consider using new Bloom with format_version>=5, smaller SST " + "file size, or partitioned filters.", + num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate); + } + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast<char>(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = static_cast<uint32_t>(num_entry * bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + int high = static_cast<int>(bytes * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, + uint32_t total_bits) { +#ifdef NDEBUG + static_cast<void>(total_bits); +#endif + assert(num_lines > 0 && total_bits > 0); + + LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, + folly::constexpr_log2(CACHE_LINE_SIZE)); +} + +class LegacyBloomBitsReader : public FilterBitsReader { + public: + LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, + uint32_t log2_cache_line_size) + : data_(data), + num_probes_(num_probes), + num_lines_(num_lines), + log2_cache_line_size_(log2_cache_line_size) {} + + // No Copy allowed + LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; + void operator=(const LegacyBloomBitsReader&) = delete; + + ~LegacyBloomBitsReader() override {} + + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. 
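+ //
+ // For this legacy format, all probes for a key fall within one cache
+ // line (located by PrepareHashMayMatch), so a negative lookup costs at
+ // most one cache miss on the filter data.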
+ bool MayMatch(const Slice& key) override { + uint32_t hash = BloomHash(key); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes; + std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(*keys[i]); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t num_lines_; + const uint32_t log2_cache_line_size_; +}; + +class AlwaysTrueFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +} // namespace + +const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllFixedImpls = { + kLegacyBloom, + kDeprecatedBlock, + kFastLocalBloom, +}; + +const std::vector<BloomFilterPolicy::Mode> BloomFilterPolicy::kAllUserModes = { + kDeprecatedBlock, + kAuto, +}; + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) + : mode_(mode), warned_(false) { + // Sanitize bits_per_key + if (bits_per_key < 1.0) { + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. + whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomFilterPolicy::~BloomFilterPolicy() {} + +const char* BloomFilterPolicy::Name() const { + return "rocksdb.BuiltinBloomFilter"; +} + +void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We should ideally only be using this deprecated interface for + // appropriately constructed BloomFilterPolicy + assert(mode_ == kDeprecatedBlock); + + // Compute bloom filter size (in both bits and bytes) + uint32_t bits = static_cast<uint32_t>(n * whole_bits_per_key_); + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
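+ // e.g. with 10 whole bits per key, n = 1 would otherwise get only 16
+ // bits after byte rounding; the 64-bit floor keeps the FP rate low for
+ // tiny filters at negligible space cost.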
+ if (bits < 64) bits = 64; + + uint32_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + int num_probes = + LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast<char>(num_probes)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (int i = 0; i < n; i++) { + LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, + array); + } +} + +bool BloomFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2 || len > 0xffffffffU) { + return false; + } + + const char* array = bloom_filter.data(); + const uint32_t bits = static_cast<uint32_t>(len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const int k = static_cast<uint8_t>(array[len - 1]); + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + // NB: using stored k not num_probes for whole_bits_per_key_ + return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, + array); +} + +FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { + // This code path should no longer be used, for the built-in + // BloomFilterPolicy. Internal to RocksDB and outside + // BloomFilterPolicy, only get a FilterBitsBuilder with + // BloomFilterPolicy::GetBuilderFromContext(), which will call + // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have + // been warned (HISTORY.md) that they can no longer call this on + // the built-in BloomFilterPolicy (unlikely). + assert(false); + return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + Mode cur = mode_; + // Unusual code construction so that we can have just + // one exhaustive switch without (risky) recursion + for (int i = 0; i < 2; ++i) { + switch (cur) { + case kAuto: + if (context.table_options.format_version < 5) { + cur = kLegacyBloom; + } else { + cur = kFastLocalBloom; + } + break; + case kDeprecatedBlock: + return nullptr; + case kFastLocalBloom: + return new FastLocalBloomBitsBuilder(millibits_per_key_); + case kLegacyBloom: + if (whole_bits_per_key_ >= 14 && context.info_log && + !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + const char* adjective; + if (whole_bits_per_key_ >= 20) { + adjective = "Dramatic"; + } else { + adjective = "Significant"; + } + // For more details, see + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter + ROCKS_LOG_WARN( + context.info_log, + "Using legacy Bloom filter with high (%d) bits/key. " + "%s filter space and/or accuracy improvement is available " + "with format_version>=5.", + whole_bits_per_key_, adjective); + } + return new LegacyBloomBitsBuilder(whole_bits_per_key_, + context.info_log); + } + } + assert(false); + return nullptr; // something legal +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. 
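+// The last 5 bytes of the filter are the metadata: 1 byte num_probes (or a
+// negative marker selecting a newer implementation) followed by 4 bytes
+// num_lines (fixed32). See the layout diagrams below.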
+FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + if (len_with_meta <= 5) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast<int8_t>(contents.data()[len_with_meta - 5]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + if (raw_num_probes == -1) { + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + } + // otherwise + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - 5; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// For newer Bloom filter implementations +FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast<uint32_t>(contents.size()); + uint32_t len = len_with_meta - 5; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... 
| + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool use_block_based_builder) { + BloomFilterPolicy::Mode m; + if (use_block_based_builder) { + m = BloomFilterPolicy::kDeprecatedBlock; + } else { + m = BloomFilterPolicy::kAuto; + } + assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), + BloomFilterPolicy::kAllUserModes.end(), + m) != BloomFilterPolicy::kAllUserModes.end()); + return new BloomFilterPolicy(bits_per_key, m); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() { } + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/filter_policy_internal.h b/src/rocksdb/table/block_based/filter_policy_internal.h new file mode 100644 index 000000000..2ca9dc859 --- /dev/null +++ b/src/rocksdb/table/block_based/filter_policy_internal.h @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <atomic> +#include <memory> +#include <string> +#include <vector> + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { + public: + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to CalculateNumEntry should + // return >= the num_entry passed in. + virtual uint32_t CalculateSpace(const int num_entry) = 0; + + // Returns an estimate of the FP rate of the returned filter if + // `keys` keys are added and the filter returned by Finish is `bytes` + // bytes. + virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy. 
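+//
+// Typical configuration from user code (sketch):
+//   BlockBasedTableOptions opts;
+//   opts.format_version = 5;  // lets kAuto select kFastLocalBloom
+//   opts.filter_policy.reset(
+//       NewBloomFilterPolicy(10.0 /* bits_per_key */,
+//                            false /* use_block_based_builder */));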
+class BloomFilterPolicy : public FilterPolicy { + public: + // An internal marker for operating modes of BloomFilterPolicy, in terms + // of selecting an implementation. This makes it easier for tests to track + // or to walk over the built-in set of Bloom filter implementations. The + // only variance in BloomFilterPolicy by mode/implementation is in + // GetFilterBitsBuilder(), so an enum is practical here vs. subclasses. + // + // This enum is essentially the union of all the different kinds of return + // value from GetFilterBitsBuilder, or "underlying implementation", and + // higher-level modes that choose an underlying implementation based on + // context information. + enum Mode { + // Legacy implementation of Bloom filter for full and partitioned filters. + // Set to 0 in case of value confusion with bool use_block_based_builder + // NOTE: TESTING ONLY as this mode does not use best compatible + // implementation + kLegacyBloom = 0, + // Deprecated block-based Bloom filter implementation. + // Set to 1 in case of value confusion with bool use_block_based_builder + // NOTE: DEPRECATED but user exposed + kDeprecatedBlock = 1, + // A fast, cache-local Bloom filter implementation. See description in + // FastLocalBloomImpl. + // NOTE: TESTING ONLY as this mode does not check format_version + kFastLocalBloom = 2, + // Automatically choose from the above (except kDeprecatedBlock) based on + // context at build time, including compatibility with format_version. + // NOTE: This is currently the only recommended mode that is user exposed. + kAuto = 100, + }; + // All the different underlying implementations that a BloomFilterPolicy + // might use, as a mode that says "always use this implementation." + // Only appropriate for unit tests. + static const std::vector<Mode> kAllFixedImpls; + + // All the different modes of BloomFilterPolicy that are exposed from + // user APIs. Only appropriate for higher-level unit tests. Integration + // tests should prefer using NewBloomFilterPolicy (user-exposed). + static const std::vector<Mode> kAllUserModes; + + explicit BloomFilterPolicy(double bits_per_key, Mode mode); + + ~BloomFilterPolicy() override; + + const char* Name() const override; + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // To use this function, call GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. 
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
+
+ // Essentially for testing only: configured millibits/key
+ int GetMillibitsPerKey() const { return millibits_per_key_; }
+ // Essentially for testing only: legacy whole bits/key
+ int GetWholeBitsPerKey() const { return whole_bits_per_key_; }
+
+ private:
+ // Newer filters support fractional bits per key. For predictable behavior
+ // of 0.001-precision values across floating point implementations, we
+ // round to thousandths of a bit (on average) per key.
+ int millibits_per_key_;
+
+ // Older filters round to whole number bits per key. (There *should* be no
+ // compatibility issue with fractional bits per key, but preserving old
+ // behavior with format_version < 5 just in case.)
+ int whole_bits_per_key_;
+
+ // Selected mode (a specific implementation or way of selecting an
+ // implementation) for building new SST filters.
+ Mode mode_;
+
+ // Whether relevant warnings have been logged already. (Remember so we
+ // only report once per BloomFilterPolicy instance, to keep the noise down.)
+ mutable std::atomic<bool> warned_;
+
+ // For newer Bloom filter implementation(s)
+ FilterBitsReader* GetBloomBitsReader(const Slice& contents) const;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.cc b/src/rocksdb/table/block_based/flush_block_policy.cc
new file mode 100644
index 000000000..f5cb2d227
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+#include <cassert>
+
+namespace ROCKSDB_NAMESPACE {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+ // @params block_size: Approximate size of user data packed per
+ // block.
+ // @params block_size_deviation: This is used to close a block before it
+ // reaches the configured block_size; see
+ // BlockAlmostFull() below.
+ FlushBlockBySizePolicy(const uint64_t block_size,
+ const uint64_t block_size_deviation,
+ const bool align,
+ const BlockBuilder& data_block_builder)
+ : block_size_(block_size),
+ block_size_deviation_limit_(
+ ((block_size * (100 - block_size_deviation)) + 99) / 100),
+ align_(align),
+ data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& key, const Slice& value) override {
+ // it makes no sense to flush when the data block is empty
+ if (data_block_builder_.empty()) {
+ return false;
+ }
+
+ auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+ // Flush if one of the two conditions below is true:
+ // 1) the current estimated size already exceeds the block size,
+ // 2) block_size_deviation is set, the estimated size after appending
+ // the kv would exceed the block size, and the current size is over
+ // the deviation limit.
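+ // e.g. block_size = 4096 and block_size_deviation = 10 give a limit of
+ // 3687 bytes: the block is flushed early only when it is already ~90%
+ // full and the next entry would push it past 4096.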
+ return curr_size >= block_size_ || BlockAlmostFull(key, value);
+ }
+
+ private:
+ bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+ if (block_size_deviation_limit_ == 0) {
+ return false;
+ }
+
+ const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+ auto estimated_size_after =
+ data_block_builder_.EstimateSizeAfterKV(key, value);
+
+ if (align_) {
+ estimated_size_after += kBlockTrailerSize;
+ return estimated_size_after > block_size_;
+ }
+
+ return estimated_size_after > block_size_ &&
+ curr_size > block_size_deviation_limit_;
+ }
+
+ const uint64_t block_size_;
+ const uint64_t block_size_deviation_limit_;
+ const bool align_;
+ const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const BlockBasedTableOptions& table_options,
+ const BlockBuilder& data_block_builder) const {
+ return new FlushBlockBySizePolicy(
+ table_options.block_size, table_options.block_size_deviation,
+ table_options.block_align, data_block_builder);
+}
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ const uint64_t size, const int deviation,
+ const BlockBuilder& data_block_builder) {
+ return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.h b/src/rocksdb/table/block_based/flush_block_policy.h
new file mode 100644
index 000000000..68c60c168
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FlushBlockEveryKeyPolicy is currently used only in tests.
+
+class FlushBlockEveryKeyPolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (!start_) {
+ start_ = true;
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ bool start_ = false;
+};
+
+class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryKeyPolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryKeyPolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryKeyPolicy;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.cc b/src/rocksdb/table/block_based/full_filter_block.cc
new file mode 100644
index 000000000..e2f7f476f
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.cc
@@ -0,0 +1,338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+ +#include "table/block_based/full_filter_block.h" +#include <array> + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(_prefix_extractor), + whole_key_filtering_(whole_key_filtering), + last_whole_key_recorded_(false), + last_prefix_recorded_(false), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); + if (whole_key_filtering_) { + if (!add_prefix) { + AddKey(key); + } else { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the + // last item. + Slice last_whole_key = Slice(last_whole_key_str_); + if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { + AddKey(key); + last_whole_key_recorded_ = true; + last_whole_key_str_.assign(key.data(), key.size()); + } + } + } + if (add_prefix) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + if (whole_key_filtering_) { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the last + // item. 
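+    // e.g. with NewFixedPrefixTransform(4), adding "app-k1" then "app-k2"
+    // yields the stream "app-k1", "app-", "app-k2", "app-": the duplicate
+    // "app-" entries are not adjacent, hence the explicit tracking here.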
+ Slice last_prefix = Slice(last_prefix_str_); + if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { + AddKey(prefix); + last_prefix_recorded_ = true; + last_prefix_str_.assign(prefix.data(), prefix.size()); + } + } else { + AddKey(prefix); + } +} + +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + +Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + Reset(); + // In this impl we ignore BlockHandle + *status = Status::OK(); + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data_); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { + full_length_enabled_ = + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); + } +} + +bool FullFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /*prefix_extractor*/, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<ParsedFullFilterBlock> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new FullFilterBlockReader(table, std::move(filter_block))); +} + +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(block_offset == kNotValid); + return MayMatch(prefix, no_io, get_context, lookup_context); +} + +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatch(entry)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } + } + return true; // remain the same with block_based filter +} + +void FullFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const SliceTransform* 
/*prefix_extractor*/, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)range; + (void)block_offset; +#endif + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + // Simply return. Don't skip any key - consider all keys as likely to be + // present + return; + } + MayMatch(range, no_io, nullptr, lookup_context); +} + +void FullFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)range; + (void)block_offset; +#endif + assert(block_offset == kNotValid); + MayMatch(range, no_io, prefix_extractor, lookup_context); +} + +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + CachableEntry<ParsedFullFilterBlock> filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { + return; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (!filter_bits_reader) { + return; + } + + // We need to use an array instead of autovector for may_match since + // &may_match[0] doesn't work for autovector<bool> (compiler error). So + // declare both keys and may_match as arrays, which is also slightly less + // expensive compared to autovector + std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys; + std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}}; + autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes; + int num_keys = 0; + MultiGetRange filter_range(*range, range->begin(), range->end()); + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!prefix_extractor) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } else { + filter_range.SkipKey(iter); + } + } + + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); + + int i = 0; + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!may_match[i]) { + // Update original MultiGet range to skip this key. 
The filter_range
+ // was temporarily used just to skip keys not in prefix_extractor domain
+ range->SkipKey(iter);
+ PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+ } else {
+ // PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+ PerfContext* perf_ctx = get_perf_context();
+ perf_ctx->bloom_sst_hit_count++;
+ }
+ ++i;
+ }
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+ size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return usage;
+}
+
+bool FullFilterBlockReader::RangeMayExist(
+ const Slice* iterate_upper_bound, const Slice& user_key,
+ const SliceTransform* prefix_extractor, const Comparator* comparator,
+ const Slice* const const_ikey_ptr, bool* filter_checked,
+ bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) {
+ if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) {
+ *filter_checked = false;
+ return true;
+ }
+ Slice prefix = prefix_extractor->Transform(user_key);
+ if (need_upper_bound_check &&
+ !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
+ *filter_checked = false;
+ return true;
+ } else {
+ *filter_checked = true;
+ return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
+ const_ikey_ptr, /* get_context */ nullptr,
+ lookup_context);
+ }
+}
+
+bool FullFilterBlockReader::IsFilterCompatible(
+ const Slice* iterate_upper_bound, const Slice& prefix,
+ const Comparator* comparator) const {
+ // Try to reuse the bloom filter in the SST table even if prefix_extractor
+ // in mutable_cf_options has changed. If all keys in the range
+ // [user_key, upper_bound) share the same prefix, then we may still be able
+ // to use the bloom filter.
+ const SliceTransform* const prefix_extractor = table_prefix_extractor();
+ if (iterate_upper_bound != nullptr && prefix_extractor) {
+ if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
+ return false;
+ }
+ Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
+ // first, check if user_key and upper_bound share the same prefix
+ if (!comparator->Equal(prefix, upper_bound_xform)) {
+ // second, check if user_key's prefix is the immediate predecessor of
+ // upper_bound and has the same length. If so, we know for sure all
+ // keys in the range [user_key, upper_bound) share the same prefix.
+ // Also need to make sure upper_bound is full length to ensure
+ // correctness
+ if (!full_length_enabled_ ||
+ iterate_upper_bound->size() != prefix_extractor_full_length_ ||
+ !comparator->IsSameLengthImmediateSuccessor(prefix,
+ *iterate_upper_bound)) {
+ return false;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.h b/src/rocksdb/table/block_based/full_filter_block.h
new file mode 100644
index 000000000..c72a58021
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
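A note on the batched MayMatch path above: it deliberately separates a gather pass (hash each key and prepare/prefetch its cache line) from a probe pass, so the cache misses of a whole batch of filter probes can overlap instead of being paid serially. Below is a minimal standalone sketch of that two-pass shape; ToyFilter, Prepare, and Check are hypothetical stand-ins for the FastLocalBloomImpl/LegacyBloomImpl helpers, not RocksDB APIs.

#include <array>
#include <cstdint>

struct ToyFilter {
  const char* data = nullptr;
  uint32_t num_lines = 0;  // filter length in 64-byte cache lines; must be > 0

  // Pass 1: pick the cache line for hash h and prefetch it (the role of
  // PrepareHash / PrepareHashMayMatch in the readers above).
  uint32_t Prepare(uint32_t h) const {
    uint32_t byte_offset = (h % num_lines) * 64;
    __builtin_prefetch(data + byte_offset);  // GCC/Clang builtin
    return byte_offset;
  }

  // Pass 2: probe the (ideally already cached) line. A real Bloom filter
  // would test num_probes bit positions; one byte-level test stands in.
  bool Check(uint32_t h, uint32_t byte_offset) const {
    return (data[byte_offset + (h & 63)] & 1) != 0;
  }
};

// All Prepare() calls issue their prefetches before any Check() touches the
// data, overlapping the memory latency across the batch.
void MayMatchBatch(const ToyFilter& f, const uint32_t* hashes, int num_keys,
                   bool* may_match) {
  std::array<uint32_t, 32> offsets;  // 32 assumed as the batch cap here
  for (int i = 0; i < num_keys; ++i) {
    offsets[i] = f.Prepare(hashes[i]);
  }
  for (int i = 0; i < num_keys; ++i) {
    may_match[i] = f.Check(hashes[i], offsets[i]);
  }
}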
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table. It generates a single string which is stored as
+// a special block in the Table.
+// The format of full filter block is:
+// +----------------------------------------------------------------+
+// | full filter for all keys in sst file |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At the end of it, we put
+// num_probes: how many hash functions are used in bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+ explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+ bool whole_key_filtering,
+ FilterBitsBuilder* filter_bits_builder);
+ // No copying allowed
+ FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
+ void operator=(const FullFilterBlockBuilder&) = delete;
+
+ // bits_builder is created in filter_policy; it is passed in here directly
+ // and deleted by this builder
+ ~FullFilterBlockBuilder() {}
+
+ virtual bool IsBlockBased() override { return false; }
+ virtual void StartBlock(uint64_t /*block_offset*/) override {}
+ virtual void Add(const Slice& key) override;
+ virtual size_t NumAdded() const override { return num_added_; }
+ virtual Slice Finish(const BlockHandle& tmp, Status* status) override;
+ using FilterBlockBuilder::Finish;
+
+ protected:
+ virtual void AddKey(const Slice& key);
+ std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+ virtual void Reset();
+ void AddPrefix(const Slice& key);
+ const SliceTransform* prefix_extractor() { return prefix_extractor_; }
+
+ private:
+ // important: all of these might point to invalid addresses
+ // at the time of destruction of this filter block. destructor
+ // should NOT dereference them.
+ const SliceTransform* prefix_extractor_;
+ bool whole_key_filtering_;
+ bool last_whole_key_recorded_;
+ std::string last_whole_key_str_;
+ bool last_prefix_recorded_;
+ std::string last_prefix_str_;
+
+ uint32_t num_added_;
+ std::unique_ptr<const char[]> filter_data_;
+
+};
+
+// A FilterBlockReader is used to parse the filter from an SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader + : public FilterBlockReaderCommon<ParsedFullFilterBlock> { + public: + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry<ParsedFullFilterBlock>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + bool full_length_enabled_; + size_t prefix_extractor_full_length_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/full_filter_block_test.cc b/src/rocksdb/table/block_based/full_filter_block_test.cc new file mode 100644 index 000000000..496b149ab --- /dev/null +++ b/src/rocksdb/table/block_based/full_filter_block_test.cc @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
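For orientation before the tests: the minimal lifecycle of the classes declared above is add keys, finish, then copy or hand off the result before the builder goes away, since Finish returns a Slice into memory owned by the builder. A sketch under those assumptions; BuildToyFilter is illustrative only:

#include <string>

#include "rocksdb/slice.h"
#include "table/block_based/full_filter_block.h"

namespace ROCKSDB_NAMESPACE {
// `bits` ownership transfers to the builder, per the constructor contract.
inline void BuildToyFilter(FilterBitsBuilder* bits, std::string* out) {
  FullFilterBlockBuilder builder(nullptr /* prefix_extractor */,
                                 true /* whole_key_filtering */, bits);
  builder.Add("foo");
  builder.Add("bar");
  Slice filter = builder.Finish();  // no-arg overload from the base class
  out->assign(filter.data(), filter.size());  // copy before builder dies
}
}  // namespace ROCKSDB_NAMESPACE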
+
+#include <set>
+
+#include "table/block_based/full_filter_block.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+ explicit TestFilterBitsBuilder() {}
+
+ // Add Key to filter
+ void AddKey(const Slice& key) override {
+ hash_entries_.push_back(Hash(key.data(), key.size(), 1));
+ }
+
+ // Generate the filter using the keys that are added
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4;
+ char* data = new char[len];
+ for (size_t i = 0; i < hash_entries_.size(); i++) {
+ EncodeFixed32(data + i * 4, hash_entries_[i]);
+ }
+ const char* const_data = data;
+ buf->reset(const_data);
+ return Slice(data, len);
+ }
+
+ private:
+ std::vector<uint32_t> hash_entries_;
+};
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+ explicit MockBlockBasedTable(Rep* rep)
+ : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class TestFilterBitsReader : public FilterBitsReader {
+ public:
+ explicit TestFilterBitsReader(const Slice& contents)
+ : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {}
+
+ // Silence compiler warning about overloaded virtual
+ using FilterBitsReader::MayMatch;
+ bool MayMatch(const Slice& entry) override {
+ uint32_t h = Hash(entry.data(), entry.size(), 1);
+ for (size_t i = 0; i + 4 <= len_; i += 4) {
+ if (h == DecodeFixed32(data_ + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private:
+ const char* data_;
+ uint32_t len_;
+};
+
+
+class TestHashFilter : public FilterPolicy {
+ public:
+ const char* Name() const override { return "TestHashFilter"; }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ for (int i = 0; i < n; i++) {
+ uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
+ PutFixed32(dst, h);
+ }
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ uint32_t h = Hash(key.data(), key.size(), 1);
+ for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
+ if (h == DecodeFixed32(filter.data() + i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ FilterBitsBuilder* GetFilterBitsBuilder() const override {
+ return new TestFilterBitsBuilder();
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ return new TestFilterBitsReader(contents);
+ }
+};
+
+class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ PluginFullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(new TestHashFilter) {}
+};
+
+TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
/*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ builder.Add("foo");
+ builder.Add("bar");
+ builder.Add("box");
+ builder.Add("box");
+ builder.Add("hello");
+ Slice slice = builder.Finish();
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class FullFilterBlockTest : public mock::MockBlockBasedTableTester,
+ public testing::Test {
+ public:
+ FullFilterBlockTest()
+ : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {}
+};
+
+TEST_F(FullFilterBlockTest, EmptyBuilder) {
+ FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+ Slice slice = builder.Finish();
+ ASSERT_EQ("", EscapeString(slice));
+
+ CachableEntry<ParsedFullFilterBlock> block(
+ new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+ BlockContents(slice)),
+ nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+ FullFilterBlockReader reader(table_.get(), std::move(block));
+ // Retain the same semantics as the block-based filter
+ ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr,
+ /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+ /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
+ std::unique_ptr<FilterBitsBuilder> b_;
+ std::set<std::string> uniq_;
+
+ public:
+ explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {}
+
+ ~CountUniqueFilterBitsBuilderWrapper() override {}
+
+ void AddKey(const Slice& key) override {
+ b_->AddKey(key);
+ uniq_.insert(key.ToString());
+ }
+
+ Slice Finish(std::unique_ptr<const char[]>* buf) override {
+ Slice rv = b_->Finish(buf);
+
uniq_.clear(); + return rv; + } + + int CalculateNumEntry(const uint32_t bytes) override { + return b_->CalculateNumEntry(bytes); + } + + size_t CountUnique() { return uniq_.size(); } +}; + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + ASSERT_EQ(0, bits_builder->CountUnique()); + // adds key and empty prefix; both abstractions count them + builder.Add("key1"); + ASSERT_EQ(2, builder.NumAdded()); + ASSERT_EQ(2, bits_builder->CountUnique()); + // Add different key (unique) and also empty prefix (not unique). + // From here in this test, it's immaterial whether the block builder + // can count unique keys. + builder.Add("key2"); + ASSERT_EQ(3, bits_builder->CountUnique()); + // Empty key -> nothing unique + builder.Add(""); + ASSERT_EQ(3, bits_builder->CountUnique()); + } + + // mix of empty and non-empty + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // 1 empty, 2 non-empty prefixes, and 4 non-empty keys + ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice = builder.Finish(); + + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+ ASSERT_TRUE(!reader.KeyMayMatch(
+ "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid,
+ /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr,
+ /*lookup_context=*/nullptr));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/index_builder.cc b/src/rocksdb/table/block_based/index_builder.cc
new file mode 100644
index 000000000..277bec61d
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/index_builder.h"
+
+#include <assert.h>
+#include <cinttypes>
+
+#include <list>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+
+// Without an anonymous namespace here, we would fail the
+// -Wmissing-prototypes warning
+namespace ROCKSDB_NAMESPACE {
+// using namespace rocksdb;
+// Create an index builder based on its type.
+IndexBuilder* IndexBuilder::CreateIndexBuilder(
+ BlockBasedTableOptions::IndexType index_type,
+ const InternalKeyComparator* comparator,
+ const InternalKeySliceTransform* int_key_slice_transform,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ IndexBuilder* result = nullptr;
+ switch (index_type) {
+ case BlockBasedTableOptions::kBinarySearch: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ false);
+ } break;
+ case BlockBasedTableOptions::kHashSearch: {
+ // Currently kHashSearch is incompatible with index_block_restart_interval
+ // > 1
+ assert(table_opt.index_block_restart_interval == 1);
+ result = new HashIndexBuilder(
+ comparator, int_key_slice_transform,
+ table_opt.index_block_restart_interval, table_opt.format_version,
+ use_value_delta_encoding, table_opt.index_shortening);
+ } break;
+ case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+ result = PartitionedIndexBuilder::CreateIndexBuilder(
+ comparator, use_value_delta_encoding, table_opt);
+ } break;
+ case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+ result = new ShortenedIndexBuilder(
+ comparator, table_opt.index_block_restart_interval,
+ table_opt.format_version, use_value_delta_encoding,
+ table_opt.index_shortening, /* include_first_key */ true);
+ } break;
+ default: {
+ assert(!"Do not recognize the index type ");
+ } break;
+ }
+ return result;
+}
+
+PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
+ const InternalKeyComparator* comparator,
+ const bool use_value_delta_encoding,
+ const BlockBasedTableOptions& table_opt) {
+ return new PartitionedIndexBuilder(comparator, table_opt,
+ use_value_delta_encoding);
+}
+
+PartitionedIndexBuilder::PartitionedIndexBuilder(
+ const
InternalKeyComparator* comparator,
+ const BlockBasedTableOptions& table_opt,
+ const bool use_value_delta_encoding)
+ : IndexBuilder(comparator),
+ index_block_builder_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ index_block_builder_without_seq_(table_opt.index_block_restart_interval,
+ true /*use_delta_encoding*/,
+ use_value_delta_encoding),
+ sub_index_builder_(nullptr),
+ table_opt_(table_opt),
+ // We start with false. After each partition we revise the value based on
+ // what the sub_index_builder has decided. If the feature is disabled
+ // entirely, this will be set to true after switching the first
+ // sub_index_builder. Otherwise, it could be set to true if even one of
+ // the sub_index_builders could not safely exclude seq from the keys; it
+ // will then be enforced on all sub_index_builders on ::Finish.
+ seperator_is_key_plus_seq_(false),
+ use_value_delta_encoding_(use_value_delta_encoding) {}
+
+PartitionedIndexBuilder::~PartitionedIndexBuilder() {
+ delete sub_index_builder_;
+}
+
+void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
+ assert(sub_index_builder_ == nullptr);
+ sub_index_builder_ = new ShortenedIndexBuilder(
+ comparator_, table_opt_.index_block_restart_interval,
+ table_opt_.format_version, use_value_delta_encoding_,
+ table_opt_.index_shortening, /* include_first_key */ false);
+ flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+ table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+ // Note: this is sub-optimal since sub_index_builder_ could later reset
+ // seperator_is_key_plus_seq_ but the probability of that is low.
+ sub_index_builder_->seperator_is_key_plus_seq_
+ ? sub_index_builder_->index_block_builder_
+ : sub_index_builder_->index_block_builder_without_seq_));
+ partition_cut_requested_ = false;
+}
+
+void PartitionedIndexBuilder::RequestPartitionCut() {
+ partition_cut_requested_ = true;
+}
+
+void PartitionedIndexBuilder::AddIndexEntry(
+ std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
+ // Note: to avoid two consecutive flushes in the same method call, we do
+ // not check the flush policy when adding the last key
+ if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ if (sub_index_builder_->seperator_is_key_plus_seq_) {
+ // then we need to apply it to all sub-index builders
+ seperator_is_key_plus_seq_ = true;
+ }
+ sub_index_last_key_ = std::string(*last_key_in_current_block);
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ sub_index_builder_ = nullptr;
+ cut_filter_block = true;
+ } else {
+ // apply flush policy only to non-empty sub_index_builder_
+ if (sub_index_builder_ != nullptr) {
+ std::string handle_encoding;
+ block_handle.EncodeTo(&handle_encoding);
+ bool do_flush =
+ partition_cut_requested_ ||
+ flush_policy_->Update(*last_key_in_current_block, handle_encoding);
+ if (do_flush) {
+ entries_.push_back(
+ {sub_index_last_key_,
+ std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+ cut_filter_block = true;
+ sub_index_builder_ = nullptr;
+ }
+ }
+ if (sub_index_builder_ == nullptr) {
+ MakeNewSubIndexBuilder();
+ }
+ sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block,
block_handle); + sub_index_last_key_ = std::string(*last_key_in_current_block); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } + } +} + +Status PartitionedIndexBuilder::Finish( + IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } + // It must be set to null after the last key is added + assert(sub_index_builder_ == nullptr); + if (finishing_indexes == true) { + Entry& last_entry = entries_.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), + handle_encoding, + &handle_delta_encoding_slice); + } + entries_.pop_front(); + } + // If there is no sub_index left, then return the 2nd level index. + if (UNLIKELY(entries_.empty())) { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; + return Status::OK(); + } else { + // Finish the next partition index in line and return Status::Incomplete() + // to indicate that we expect more calls to Finish + Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; + auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); + finishing_indexes = true; + return s.ok() ? Status::Incomplete() : s; + } +} + +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/index_builder.h b/src/rocksdb/table/block_based/index_builder.h new file mode 100644 index 000000000..bfffc5996 --- /dev/null +++ b/src/rocksdb/table/block_based/index_builder.h @@ -0,0 +1,443 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include <assert.h> +#include <cinttypes> + +#include <list> +#include <string> +#include <unordered_map> + +#include "rocksdb/comparator.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { +// The interface for building an index. +// Instructions for adding a new concrete IndexBuilder: +// 1. Create a subclass derived from IndexBuilder. +// 2. Add a new entry associated with that subclass in +// BlockBasedTableOptions::IndexType. +// 3.
Add a create function for the new subclass in CreateIndexBuilder. +// Note: we could devise a more advanced design to simplify the process of +// adding a new subclass, but that would increase code complexity and draw +// unwanted attention from readers. Given that we won't add/change indexes +// frequently, it makes sense to just embrace a straightforward design that +// works. +class IndexBuilder { + public: + static IndexBuilder* CreateIndexBuilder( + BlockBasedTableOptions::IndexType index_type, + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + // Index builder will construct a set of blocks which contain: + // 1. One primary index block. + // 2. (Optional) a set of metablocks that contains the metadata of the + // primary index. + struct IndexBlocks { + Slice index_block_contents; + std::unordered_map<std::string, Slice> meta_blocks; + }; + explicit IndexBuilder(const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + virtual ~IndexBuilder() {} + + // Add a new index entry to index block. + // To allow further optimization, we provide `last_key_in_current_block` and + // `first_key_in_next_block`, based on which the specific implementation can + // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. + // @last_key_in_current_block: this parameter may be overridden with the + // value "substitute key". + // @first_key_in_next_block: it will be nullptr if the entry being added is + // the last one in the table + // + // REQUIRES: Finish() has not yet been called. + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) = 0; + + // This method will be called whenever a key is added. The subclasses may + // override OnKeyAdded() if they need to collect additional information. + virtual void OnKeyAdded(const Slice& /*key*/) {} + + // Inform the index builder that all entries have been written. The index + // builder may therefore perform any operation required for block + // finalization. + // + // REQUIRES: Finish() has not yet been called. + inline Status Finish(IndexBlocks* index_blocks) { + // Throw away the changes to last_partition_block_handle. It has no effect + // on the first call to Finish anyway. + BlockHandle last_partition_block_handle; + return Finish(index_blocks, last_partition_block_handle); + } + + // This override of Finish can be utilized to build the 2nd level index in + // PartitionedIndexBuilder. + // + // index_blocks will be filled with the resulting index data. If the return + // value is Status::Incomplete(), it means that the index is partitioned and + // the caller should keep calling Finish until Status::OK() is returned. In + // that case, last_partition_block_handle refers to the block written with + // the result of the previous call to Finish. This can be utilized to build + // the second level index pointing to each block of partitioned indexes. The + // last call to Finish() that returns Status::OK() populates index_blocks with + // the 2nd level index content. + virtual Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) = 0; + + // Get the size of the index block. Must be called after ::Finish.
+ virtual size_t IndexSize() const = 0; + + virtual bool seperator_is_key_plus_seq() { return true; } + + protected: + const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; +}; + +// This index builder builds a space-efficient index block. +// +// Optimizations: +// 1. Set the block's `block_restart_interval` to 1, which avoids a linear +// search when doing index lookups (can be disabled by setting +// index_block_restart_interval). +// 2. Shorten the keys in the index block. Rather than using the last key in +// the data block verbatim as the index key, we find the shortest +// substitute key that serves the same purpose. +class ShortenedIndexBuilder : public IndexBuilder { + public: + explicit ShortenedIndexBuilder( + const InternalKeyComparator* comparator, + const int index_block_restart_interval, const uint32_t format_version, + const bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) + : IndexBuilder(comparator), + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), + shortening_mode_(shortening_mode) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } + + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + if (first_key_in_next_block != nullptr) { + if (shortening_mode_ != + BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { + comparator_->FindShortestSeparator(last_key_in_current_block, + *first_key_in_next_block); + } + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } + } else { + if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor) { + comparator_->FindShortSuccessor(last_key_in_current_block); + } + } + auto sep = Slice(*last_key_in_current_block); + + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use the delta-encoded slice.
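+ // + // Illustration (hypothetical handles): if the previous handle was + // (offset=0, size=4096) and the current one is (offset=4101, size=4096), + // the delta encoding can omit the offset entirely -- it is implied by the + // previous handle plus its size and the block trailer -- and store only + // the signed size delta (here 0), typically a single varint byte.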
+ } + last_encoded_handle_ = block_handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); + } + + current_block_first_internal_key_.clear(); + } + + using IndexBuilder::Finish; + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& /*last_partition_block_handle*/) override { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + index_size_ = index_blocks->index_block_contents.size(); + return Status::OK(); + } + + virtual size_t IndexSize() const override { return index_size_; } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + friend class PartitionedIndexBuilder; + + private: + BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; + bool seperator_is_key_plus_seq_; + const bool include_first_key_; + BlockBasedTableOptions::IndexShorteningMode shortening_mode_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; +}; + +// HashIndexBuilder contains a binary-searchable primary index and the +// metadata for secondary hash index construction. +// The metadata for the hash index consists of two parts: +// - a metablock that compactly contains a sequence of prefixes. All prefixes +// are stored consecutively without any per-prefix metadata (such as prefix +// sizes); that metadata is kept in the other metablock. +// - a metablock that contains the metadata of the prefixes, including prefix +// size, restart index, and the number of blocks it spans. The format (all +// three fields varint32-encoded) looks like: +// +// +------------------+----------------------------+----------------------+ <= prefix 1 +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ <= prefix 2 +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ +// | | +// | .... | +// | | +// +------------------+----------------------------+----------------------+ <= prefix n +// | length: varint32 | restart interval: varint32 | num-blocks: varint32 | +// +------------------+----------------------------+----------------------+ +// +// The reason for separating these two metablocks is to enable efficient reuse +// of the first metablock during hash index construction without unnecessary +// data copies or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder { + public: + explicit HashIndexBuilder( + const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, int format_version, + bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode) + : IndexBuilder(comparator), + primary_index_builder_(comparator, index_block_restart_interval, + format_version, use_value_delta_encoding, + shortening_mode, /* include_first_key */ false), + hash_key_extractor_(hash_key_extractor) {} + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + ++current_restart_index_; + primary_index_builder_.AddIndexEntry(last_key_in_current_block, + first_key_in_next_block, block_handle); + } + + virtual void OnKeyAdded(const Slice& key) override { + auto key_prefix = hash_key_extractor_->Transform(key); + bool is_first_entry = pending_block_num_ == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix_ != key_prefix) { + if (!is_first_entry) { + FlushPendingPrefix(); + } + + // Need a hard copy, otherwise the underlying data changes all the time. + // TODO(kailiu) ToString() is expensive. We may be able to speed this up + // by avoiding the data copy. + pending_entry_prefix_ = key_prefix.ToString(); + pending_block_num_ = 1; + pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); + } else { + // The block count increments when keys sharing the prefix reside in + // different data blocks. + auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; + assert(last_restart_index <= current_restart_index_); + if (last_restart_index != current_restart_index_) { + ++pending_block_num_; + } + } + } + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { + if (pending_block_num_ != 0) { + FlushPendingPrefix(); + } + primary_index_builder_.Finish(index_blocks, last_partition_block_handle); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesBlock.c_str(), prefix_block_}); + index_blocks->meta_blocks.insert( + {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); + return Status::OK(); + } + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + + prefix_meta_block_.size(); + } + + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + + private: + void FlushPendingPrefix() { + prefix_block_.append(pending_entry_prefix_.data(), + pending_entry_prefix_.size()); + PutVarint32Varint32Varint32( + &prefix_meta_block_, + static_cast<uint32_t>(pending_entry_prefix_.size()), + pending_entry_index_, pending_block_num_); + } + + ShortenedIndexBuilder primary_index_builder_; + const SliceTransform* hash_key_extractor_; + + // stores a sequence of prefixes + std::string prefix_block_; + // stores the metadata of prefixes + std::string prefix_meta_block_; + + // The following 3 variables keep the unflushed prefix and its metadata. + // The details of block_num and entry_index can be found in + // "block_hash_index.{h,cc}" + uint32_t pending_block_num_ = 0; + uint32_t pending_entry_index_ = 0; + std::string pending_entry_prefix_; + + uint64_t current_restart_index_ = 0; +}; + +/** + * IndexBuilder for two-level indexing.
Internally it creates a new index builder for + * each partition and finishes them in order as Finish is called on it + * repeatedly until Status::OK() is returned. + * + * The format on the disk would be I I I I I I IP where I is a block containing + * a partition of indexes built using ShortenedIndexBuilder and IP is a block + * containing a secondary index on the partitions, built using + * ShortenedIndexBuilder. + */ +class PartitionedIndexBuilder : public IndexBuilder { + public: + static PartitionedIndexBuilder* CreateIndexBuilder( + const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt); + + explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); + + virtual ~PartitionedIndexBuilder(); + + virtual void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; + + virtual Status Finish( + IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; + + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } + size_t NumPartitions() const; + + inline bool ShouldCutFilterBlock() { + // Current policy is to align the partitions of index and filters + if (cut_filter_block) { + cut_filter_block = false; + return true; + } + return false; + } + + std::string& GetPartitionKey() { return sub_index_last_key_; } + + // Called when an external entity (such as filter partition builder) requests + // cutting the next partition + void RequestPartitionCut(); + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + + void MakeNewSubIndexBuilder(); + + struct Entry { + std::string key; + std::unique_ptr<ShortenedIndexBuilder> value; + }; + std::list<Entry> entries_; // list of partitioned indexes and their keys + BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys + // the active partition index builder + ShortenedIndexBuilder* sub_index_builder_; + // the last key in the active partition index builder + std::string sub_index_last_key_; + std::unique_ptr<FlushBlockPolicy> flush_policy_; + // true if Finish is called once but not complete yet. + bool finishing_indexes = false; + const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; + // true if an external entity (such as filter partition builder) requests + // cutting the next partition + bool partition_cut_requested_ = true; + // true if it should cut the next filter partition block + bool cut_filter_block = false; + BlockHandle last_encoded_handle_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/mock_block_based_table.h b/src/rocksdb/table/block_based/mock_block_based_table.h new file mode 100644 index 000000000..54817bd67 --- /dev/null +++ b/src/rocksdb/table/block_based/mock_block_based_table.h @@ -0,0 +1,56 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + + MockBlockBasedTableTester(const FilterPolicy *filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy.reset(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + context.info_log = ioptions_.info_log; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.cc b/src/rocksdb/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 000000000..3e555387e --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" +#include "rocksdb/filter_policy.h" + +namespace ROCKSDB_NAMESPACE { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.h b/src/rocksdb/table/block_based/parsed_full_filter_block.h new file mode 100644 index 000000000..36c619921 --- /dev/null +++ b/src/rocksdb/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> + +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. 
+class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + private: + BlockContents block_contents_; + std::unique_ptr<FilterBitsReader> filter_bits_reader_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.cc b/src/rocksdb/table/block_based/partitioned_filter_block.cc new file mode 100644 index 000000000..2138d96dd --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,388 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include <utility> + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + keys_added_to_partition_(0) { + keys_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { + // Use == to send the request only once + if (keys_added_to_partition_ == keys_per_partition_) { + // Currently only the index builder is in charge of cutting a partition. + // We keep requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + filter_gc.push_back(std::unique_ptr<const char[]>(nullptr)); + + // Add the prefix of the next key before finishing the partition. This hack + // fixes a bug with format_version=3 where seeking for the prefix would lead + // us to the previous partition.
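+ // Concrete illustration (hypothetical keys, mirroring the + // PrefixInWrongPartitionBug test in partitioned_filter_block_test.cc): if + // the partition being cut ends with "p2-key2" and the next key is + // "p3-key3", a later PrefixMayMatch("p3") seeks the top-level index to + // this partition, so the prefix "p3" must also be added here, not only to + // the partition that will contain "p3-key3".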
+ const bool add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (add_prefix) { + FullFilterBlockBuilder::AddPrefix(*next_key); + } + + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, filter}); + keys_added_to_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + FullFilterBlockBuilder::AddKey(key); + keys_added_to_partition_++; +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + FilterEntry& last_entry = filters.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } + filters.pop_front(); + } else { + MaybeCutAFilterBlock(nullptr); + } + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + if (finishing_filters) { + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + return filters.front().filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, CachableEntry<Block>&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry<Block> filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr<FilterBlockReader>(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr<FilterBlockReader>( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* 
lookup_context) { + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return true; + } + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry<Block>& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. 
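+ // Illustration (hypothetical keys): with partitions whose index keys are + // "d" and "p", Seek("zebra") runs past the last entry and invalidates the + // iterator, yet the prefix of "zebra" could still have been added to the + // last ("p") partition, hence the SeekToLast() below.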
+ iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<ParsedFullFilterBlock>* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since the block cache might not have had + // space for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry<Block> filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range + return false; + } + + CachableEntry<ParsedFullFilterBlock> filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry<Block> filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before reading the partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + + IndexBlockIter biter; + const
InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecutive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value().handle; + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr<FilePrefetchBuffer> prefetch_buffer; + + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast<size_t>(prefetch_len)); + + // After prefetch, read the partitions one by one + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { + handle = biter.value().handle; + + CachableEntry<ParsedFullFilterBlock> block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } + } + } + } +} + +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.h b/src/rocksdb/table/block_based/partitioned_filter_block.h new file mode 100644 index 000000000..314297cab --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
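The builder and reader above are engaged when partitioned filters are enabled through the public API. A minimal configuration sketch (these are real BlockBasedTableOptions fields; the values are only illustrative):

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakePartitionedFilterOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10 /* bits_per_key */, false));
  // A two-level index is a prerequisite for partitioned filters; filter
  // partitions are then cut in lockstep with index partitions (see
  // RequestPartitionCut()/ShouldCutFilterBlock() above).
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;
  table_options.metadata_block_size = 4096;  // target partition size
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}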
+ +#pragma once + +#include <list> +#include <string> +#include <unordered_map> +#include "db/dbformat.h" +#include "index_builder.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + void Add(const Slice& key) override; + + virtual Slice Finish(const BlockHandle& last_partition_block_handle, + Status* status) override; + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + Slice filter; + }; + std::list<FilterEntry> filters; // list of partitioned filters and their keys + std::unique_ptr<IndexBuilder> value; + std::vector<std::unique_ptr<const char[]>> filter_gc; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when to cut a filter block and Finish it + void MaybeCutAFilterBlock(const Slice* next_key); + // Currently we keep the same number of partitions for filters and indexes. + // This allows for some potential optimizations in the future.
If such + // optimizations do not materialize, we can use a different number of + // partitions and eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of keys per partition + uint32_t keys_per_partition_; + // The number of keys added to the last partition so far + uint32_t keys_added_to_partition_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> { + public: + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry<Block>&& filter_block); + + static std::unique_ptr<FilterBlockReader> Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry<ParsedFullFilterBlock>* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>> + filter_map_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/block_based/partitioned_filter_block_test.cc b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc new file mode 100644 index 000000000..071bad9ca --- /dev/null +++ b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc @@ -0,0 +1,424 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
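For intuition about keys_per_partition_ declared above: the builder asks FilterBitsBuilder::CalculateNumEntry() how many keys fit in a partition-sized filter. A back-of-envelope sketch, under the assumption of a classic Bloom filter at 10 bits per key:

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t partition_size = 4096;  // bytes, from metadata_block_size
  const double bits_per_key = 10.0;      // NewBloomFilterPolicy(10, ...)
  // A Bloom filter of partition_size bytes holds roughly
  // partition_size * 8 / bits_per_key keys.
  const uint32_t keys_per_partition =
      static_cast<uint32_t>(partition_size * 8 / bits_per_key);
  std::cout << keys_per_partition << " keys per filter partition\n";  // ~3276
  return 0;
}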
+ +#include <map> + +#include "rocksdb/filter_policy.h" + +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/filter_policy_internal.h" + +#include "index_builder.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +std::map<uint64_t, std::string> blooms; + +class MockedBlockBasedTable : public BlockBasedTable { + public: + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { + // Initialize what Open normally does as much as necessary for the test + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); + } +}; + +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry<Block>&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : blooms) { + const uint64_t offset = pair.first; + const std::string& bloom = pair.second; + + assert(t); + assert(t->get_rep()); + CachableEntry<ParsedFullFilterBlock> block( + new ParsedFullFilterBlock( + t->get_rep()->table_options.filter_policy.get(), + BlockContents(Slice(bloom))), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + filter_map_[offset] = std::move(block); + } + } +}; + +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr<BlockBasedTable> table_; + std::shared_ptr<Cache> cache_; + int bits_per_key_; + + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator), + bits_per_key_(10) { + table_options_.filter_policy.reset( + NewBloomFilterPolicy(bits_per_key_, false)); + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; + } + + ~PartitionedFilterBlockTest() override {} + + const std::string keys[4] = {"afoo", "bar", "box", "hello"}; + const std::string missing_keys[2] = {"missing", "other"}; + + uint64_t MaxIndexSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + uint64_t max_key_size = 0; + for (int i = 1; i < num_keys; i++) { + max_key_size = std::max(max_key_size, static_cast<uint64_t>(keys[i].size())); + } + uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/); + return max_index_size; + } + + uint64_t MaxFilterSize() { + int num_keys = sizeof(keys) / sizeof(*keys); + // General, rough over-approximation + return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); + } + + uint64_t last_offset = 10; + BlockHandle Write(const Slice& slice) { + BlockHandle bh(last_offset + 1, slice.size()); + blooms[bh.offset()] = slice.ToString(); + last_offset += bh.size(); + return bh; + } + + PartitionedIndexBuilder* NewIndexBuilder() { + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp_, !kValueDeltaEncoded, table_options_); + } + + PartitionedFilterBlockBuilder* NewBuilder( + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { + 
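// The target partition size below is metadata_block_size reduced by + // block_size_deviation percent, rounded up; e.g., with + // metadata_block_size = 4096 and block_size_deviation = 10 this yields + // (4096 * 90 + 99) / 100 = 3687 bytes. +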
assert(table_options_.block_size_deviation <= 100); + auto partition_size = static_cast<uint32_t>( + ((table_options_.metadata_block_size * + (100 - table_options_.block_size_deviation)) + + 99) / + 100); + partition_size = std::max(partition_size, static_cast<uint32_t>(1)); + const bool kValueDeltaEncoded = true; + return new PartitionedFilterBlockBuilder( + prefix_extractor, table_options_.whole_key_filtering, + BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_)), + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); + } + + PartitionedFilterBlockReader* NewReader( + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { + BlockHandle bh; + Status status; + Slice slice; + do { + slice = builder->Finish(bh, &status); + bh = Write(slice); + } while (status.IsIncomplete()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry<Block> block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); + return reader; + } + + void VerifyReader(PartitionedFilterBlockBuilder* builder, + PartitionedIndexBuilder* pib, bool empty = false, + const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder, pib)); + // Querying added keys + const bool no_io = true; + for (auto key : keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + { + // querying a key twice + auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } + // querying missing keys + for (auto key : missing_keys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + if (empty) { + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } else { + // assuming a good hash function + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + } + } + } + + int TestBlockPerKey() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + return CountNumOfIndexPartitions(pib.get()); + } + + void 
TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor)); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i], keys[i + 1]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get(), /*empty=*/false, prefix_extractor); + } + + void TestBlockPerAllKeys() { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get())); + int i = 0; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + builder->Add(keys[i]); + i++; + builder->Add(keys[i]); + CutABlock(pib.get(), keys[i]); + + VerifyReader(builder.get(), pib.get()); + } + + void CutABlock(PartitionedIndexBuilder* builder, + const std::string& user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + builder->AddIndexEntry(&key, nullptr, dont_care_block_handle); + } + + void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key, + const std::string& next_user_key) { + // Assuming a block is cut, add an entry to the index + std::string key = + std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep()); + std::string next_key = std::string( + *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep()); + BlockHandle dont_care_block_handle(1, 1); + Slice slice = Slice(next_key.data(), next_key.size()); + builder->AddIndexEntry(&key, &slice, dont_care_block_handle); + } + + int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) { + IndexBuilder::IndexBlocks dont_care_ib; + BlockHandle dont_care_bh(10, 10); + Status s; + int cnt = 0; + do { + s = builder->Finish(&dont_care_ib, dont_care_bh); + cnt++; + } while (s.IsIncomplete()); + return cnt - 1; // subtract 1 for the 2nd level index + } +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get())); + const bool empty = true; + VerifyReader(builder.get(), pib.get(), empty); +} + +TEST_P(PartitionedFilterBlockTest, OneBlock) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerAllKeys(); + } +} + +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerTwoKeys(); + } +} + +// This reproduces the bug where the same prefix appears in multiple +// consecutive blocks but was added only to the first block's filter.
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(1)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + // Non-existent keys but with the same prefix + const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + for (auto key : pnonkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +// This reproduces the bug in format_version=3 where seeking the prefix will +// lead us to the partition before the one that has the filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr<const SliceTransform> prefix_extractor( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(2)); + std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder()); + std::unique_ptr<PartitionedFilterBlockBuilder> builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3 + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist.
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr<PartitionedFilterBlockReader> reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix, prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { + uint64_t max_index_size = MaxIndexSize(); + for (uint64_t i = 1; i < max_index_size + 1; i++) { + table_options_.metadata_block_size = i; + TestBlockPerKey(); + } +} + +TEST_P(PartitionedFilterBlockTest, PartitionCount) { + int num_keys = sizeof(keys) / sizeof(*keys); + table_options_.metadata_block_size = + std::max(MaxIndexSize(), MaxFilterSize()); + int partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, 1); + // A low number ensures cutting a block after each key + table_options_.metadata_block_size = 1; + partitions = TestBlockPerKey(); + ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.cc b/src/rocksdb/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 000000000..78e2b93c1 --- /dev/null +++ b/src/rocksdb/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
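The reader defined below only comes into play for tables written with a compression dictionary. A hedged sketch of the public options that would produce such tables (these are real CompressionOptions fields; the sizes are only illustrative):

#include "rocksdb/options.h"

rocksdb::Options MakeDictionaryCompressedOptions() {
  rocksdb::Options options;
  options.compression = rocksdb::kZSTD;
  // A non-zero max_dict_bytes makes table building train and store a
  // dictionary block, which UncompressionDictReader later retrieves.
  options.compression_opts.max_dict_bytes = 16 * 1024;
  // For ZSTD, optionally sample up to this many bytes to train the dictionary.
  options.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  return options;
}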
+//
+
+#include "table/block_based/uncompression_dict_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status UncompressionDictReader::Create(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    bool use_cache, bool prefetch, bool pin,
+    BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(uncompression_dict_reader);
+
+  CachableEntry<UncompressionDict> uncompression_dict;
+  // Read the dictionary eagerly when prefetching or when the block cache is
+  // bypassed; otherwise it is loaded lazily on first use.
+  if (prefetch || !use_cache) {
+    const Status s = ReadUncompressionDictionary(
+        table, prefetch_buffer, ReadOptions(), use_cache,
+        nullptr /* get_context */, lookup_context, &uncompression_dict);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      // The dictionary now resides in the block cache; since it is not meant
+      // to be pinned, drop our reference and look it up again on demand.
+      uncompression_dict.Reset();
+    }
+  }
+
+  uncompression_dict_reader->reset(
+      new UncompressionDictReader(table, std::move(uncompression_dict)));
+
+  return Status::OK();
+}
+
+Status UncompressionDictReader::ReadUncompressionDictionary(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) {
+  // TODO: add perf counter for compression dictionary read time
+
+  assert(table);
+  assert(uncompression_dict);
+  assert(uncompression_dict->IsEmpty());
+
+  const BlockBasedTable::Rep* const rep = table->get_rep();
+  assert(rep);
+  assert(!rep->compression_dict_handle.IsNull());
+
+  const Status s = table->RetrieveBlock(
+      prefetch_buffer, read_options, rep->compression_dict_handle,
+      UncompressionDict::GetEmptyDict(), uncompression_dict,
+      BlockType::kCompressionDictionary, get_context, lookup_context,
+      /* for_compaction */ false, use_cache);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        rep->ioptions.info_log,
+        "Encountered error while reading data from compression dictionary "
+        "block %s",
+        s.ToString().c_str());
+  }
+
+  return s;
+}
+
+Status UncompressionDictReader::GetOrReadUncompressionDictionary(
+    FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) const {
+  assert(uncompression_dict);
+
+  if (!uncompression_dict_.IsEmpty()) {
+    uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+
+  return ReadUncompressionDictionary(table_, prefetch_buffer, read_options,
+                                     cache_dictionary_blocks(), get_context,
+                                     lookup_context, uncompression_dict);
+}
+
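+// Only a dictionary owned by this reader is charged here; a dictionary that
+// resides in the block cache is accounted for by the cache itself.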
+size_t UncompressionDictReader::ApproximateMemoryUsage() const {
+  assert(!uncompression_dict_.GetOwnValue() ||
+         uncompression_dict_.GetValue() != nullptr);
+  size_t usage = uncompression_dict_.GetOwnValue()
+                     ? uncompression_dict_.GetValue()->ApproximateMemoryUsage()
+                     : 0;
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+
+  return usage;
+}
+
+bool UncompressionDictReader::cache_dictionary_blocks() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  // Dictionary blocks are cached if and only if index and filter blocks are.
+  return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.h b/src/rocksdb/table/block_based/uncompression_dict_reader.h
new file mode 100644
index 000000000..3e7826179
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+#include "table/block_based/cachable_entry.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+struct BlockCacheLookupContext;
+class FilePrefetchBuffer;
+class GetContext;
+struct ReadOptions;
+struct UncompressionDict;
+
+// Provides access to the uncompression dictionary regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class UncompressionDictReader {
+ public:
+  static Status Create(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      bool use_cache, bool prefetch, bool pin,
+      BlockCacheLookupContext* lookup_context,
+      std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader);
+
+  Status GetOrReadUncompressionDictionary(
+      FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict) const;
+
+  size_t ApproximateMemoryUsage() const;
+
+ private:
+  UncompressionDictReader(
+      const BlockBasedTable* t,
+      CachableEntry<UncompressionDict>&& uncompression_dict)
+      : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
+    assert(table_);
+  }
+
+  bool cache_dictionary_blocks() const;
+
+  static Status ReadUncompressionDictionary(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict);
+
+  const BlockBasedTable* table_;
+  CachableEntry<UncompressionDict> uncompression_dict_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
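For orientation, the following is a minimal sketch, not part of the commit, of how the table open and read paths might drive this API. The helper names OpenDict and FetchDict and the particular flag values are illustrative assumptions; the signatures match the header above, and the table, prefetch_buffer, and lookup_context arguments are assumed to come from the surrounding BlockBasedTable machinery.

#include <memory>

#include "table/block_based/uncompression_dict_reader.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical open-path helper: with prefetch=true and pin=true the
// dictionary is read once up front and stays resident in the reader.
Status OpenDict(const BlockBasedTable* table,
                FilePrefetchBuffer* prefetch_buffer,
                BlockCacheLookupContext* lookup_context,
                std::unique_ptr<UncompressionDictReader>* out) {
  return UncompressionDictReader::Create(table, prefetch_buffer,
                                         /* use_cache */ true,
                                         /* prefetch */ true, /* pin */ true,
                                         lookup_context, out);
}

// Hypothetical read-path helper: no_io=false allows a file read if the
// dictionary is in neither the reader nor the block cache; passing
// no_io=true would restrict the lookup to the cache (kBlockCacheTier).
Status FetchDict(const UncompressionDictReader& reader,
                 CachableEntry<UncompressionDict>* dict) {
  return reader.GetOrReadUncompressionDictionary(
      /* prefetch_buffer */ nullptr, /* no_io */ false,
      /* get_context */ nullptr, /* lookup_context */ nullptr, dict);
}

}  // namespace ROCKSDB_NAMESPACE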