diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/table/block_based/block_based_table_iterator.h | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/rocksdb/table/block_based/block_based_table_iterator.h | 280 |
1 files changed, 280 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block_based_table_iterator.h b/src/rocksdb/table/block_based/block_based_table_iterator.h new file mode 100644 index 000000000..a2918b248 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_iterator.h @@ -0,0 +1,280 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterates over the contents of BlockBasedTable. +class BlockBasedTableIterator : public InternalIteratorBase<Slice> { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + // @param read_options Must outlive this iterator. + public: + BlockBasedTableIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, TableReaderCaller caller, + size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + lookup_context_(caller), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size), + allow_unprepared_value_(allow_unprepared_value), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + async_read_in_progress_(false) {} + + ~BlockBasedTableIterator() {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + bool PrepareValue() override { + assert(Valid()); + + if (!is_at_first_key_from_index_) { + return true; + } + + return const_cast<BlockBasedTableIterator*>(this) + ->MaterializeCurrentBlock(); + } + Slice value() const override { + // PrepareValue() must have been called. + assert(!is_at_first_key_from_index_); + assert(Valid()); + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else if (async_read_in_progress_) { + return Status::TryAgain(); + } else { + return Status::OK(); + } + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (is_out_of_bound_) { + return IterBoundCheck::kOutOfBound; + } else if (block_upper_bound_check_ == + BlockUpperBound::kUpperBoundBeyondCurBlock) { + assert(!is_out_of_bound_); + return IterBoundCheck::kInbound; + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + assert(!is_at_first_key_from_index_); + assert(Valid()); + + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + block_upper_bound_check_ = BlockUpperBound::kUnknown; + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->GetReadaheadState(readahead_file_info); + } + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->SetReadaheadState(readahead_file_info); + } + } + } + + std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_; + + private: + enum class IterDirection { + kForward, + kBackward, + }; + // This enum indicates whether the upper bound falls into current block + // or beyond. + // +-------------+ + // | cur block | <-- (1) + // +-------------+ + // <-- (2) + // --- <boundary key> --- + // <-- (3) + // +-------------+ + // | next block | <-- (4) + // ...... + // + // When the block is smaller than <boundary key>, kUpperBoundInCurBlock + // is the value to use. The examples are (1) or (2) in the graph. It means + // all keys in the next block or beyond will be out of bound. Keys within + // the current block may or may not be out of bound. + // When the block is larger or equal to <boundary key>, + // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4) + // in the graph. It means that all keys in the current block is within the + // upper bound and keys in the next block may or may not be within the uppder + // bound. + // If the boundary key hasn't been checked against the upper bound, + // kUnknown can be used. + enum class BlockUpperBound { + kUpperBoundInCurBlock, + kUpperBoundBeyondCurBlock, + kUnknown, + }; + + const BlockBasedTable* table_; + const ReadOptions& read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + PinnedIteratorsManager* pinned_iters_mgr_; + DataBlockIter block_iter_; + const SliceTransform* prefix_extractor_; + uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max(); + BlockCacheLookupContext lookup_context_; + + BlockPrefetcher block_prefetcher_; + + const bool allow_unprepared_value_; + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). + bool is_out_of_bound_ = false; + // How current data block's boundary key with the next block is compared with + // iterate upper bound. + BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to PrepareValue() will trigger loading the block. + bool is_at_first_key_from_index_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + + bool async_read_in_progress_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target, bool async_prefetch); + + void InitDataBlock(); + void AsyncInitDataBlock(bool is_first_pass); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); + + bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { + if (need_upper_bound_check_ && direction == IterDirection::kBackward) { + // Upper bound check isn't sufficient for backward direction to + // guarantee the same result as total order, so disable prefix + // check. + return true; + } + if (check_filter_ && !table_->PrefixRangeMayMatch( + ikey, read_options_, prefix_extractor_, + need_upper_bound_check_, &lookup_context_)) { + // TODO remember the iterator is invalidated because of prefix + // match. This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } +}; +} // namespace ROCKSDB_NAMESPACE |