Diffstat (limited to 'src/rocksdb/table/block_based/block.h')
-rw-r--r--  src/rocksdb/table/block_based/block.h  744
1 file changed, 744 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h
new file mode 100644
index 000000000..5d73f72f6
--- /dev/null
+++ b/src/rocksdb/table/block_based/block.h
@@ -0,0 +1,744 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/pinned_iterators_manager.h"
+#include "port/malloc.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+class Comparator;
+template <class TValue>
+class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
+class MetaBlockIter;
+class BlockPrefixIndex;
+
+// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
+// bytes to a bitmap at a ratio of bytes_per_bit bytes per bit. Whenever we
+// access a range of bytes in the Block, we update the bitmap and increment
+// READ_AMP_ESTIMATE_USEFUL_BYTES.
+class BlockReadAmpBitmap {
+ public:
+ explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
+ Statistics* statistics)
+ : bitmap_(nullptr),
+ bytes_per_bit_pow_(0),
+ statistics_(statistics),
+ rnd_(Random::GetTLSInstance()->Uniform(
+ static_cast<int>(bytes_per_bit))) {
+ TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
+ assert(block_size > 0 && bytes_per_bit > 0);
+
+ // convert bytes_per_bit to be a power of 2
+ while (bytes_per_bit >>= 1) {
+ bytes_per_bit_pow_++;
+ }
+
+ // num_bits_needed = ceil(block_size / bytes_per_bit)
+ size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
+ assert(num_bits_needed > 0);
+
+ // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
+ size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
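+    // For example (hypothetical values): block_size = 4096 and
+    // bytes_per_bit = 32 give bytes_per_bit_pow_ = 5, num_bits_needed = 128,
+    // and bitmap_size = 4 (four 32-bit entries).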
+
+ // Create bitmap and set all the bits to 0
+ bitmap_ = new std::atomic<uint32_t>[bitmap_size]();
+
+ RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
+ }
+
+ ~BlockReadAmpBitmap() { delete[] bitmap_; }
+
+ void Mark(uint32_t start_offset, uint32_t end_offset) {
+ assert(end_offset >= start_offset);
+    // Index of first bit in mask, i.e.
+    // ceil((start_offset - rnd_) / bytes_per_bit)
+ uint32_t start_bit =
+ (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
+ bytes_per_bit_pow_;
+    // Index of last bit in mask + 1, i.e.
+    // floor((end_offset - rnd_) / bytes_per_bit) + 1
+ uint32_t exclusive_end_bit =
+ (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
+ if (start_bit >= exclusive_end_bit) {
+ return;
+ }
+ assert(exclusive_end_bit > 0);
+
+ if (GetAndSet(start_bit) == 0) {
+ uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
+ << bytes_per_bit_pow_;
+ RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
+ new_useful_bytes);
+ }
+ }
+
+ Statistics* GetStatistics() {
+ return statistics_.load(std::memory_order_relaxed);
+ }
+
+ void SetStatistics(Statistics* stats) { statistics_.store(stats); }
+
+ uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
+
+ size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ return malloc_usable_size((void*)this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ return sizeof(*this);
+ }
+
+ private:
+ // Get the current value of bit at `bit_idx` and set it to 1
+ inline bool GetAndSet(uint32_t bit_idx) {
+ const uint32_t byte_idx = bit_idx / kBitsPerEntry;
+ const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);
+
+ return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
+ bit_mask;
+ }
+
+  const uint32_t kBytesPerEntry = sizeof(uint32_t);  // 4 bytes
+  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits
+
+  // Bitmap used to record the bytes that we read; atomics protect against
+  // multiple threads updating the same bit
+  std::atomic<uint32_t>* bitmap_;
+  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Using a power of 2 lets us
+  // replace multiplication and division with shifts
+  uint8_t bytes_per_bit_pow_;
+  // Pointer to the DB Statistics object. Since this bitmap may outlive the
+  // DB, this pointer may become invalid, but the DB will update it to a
+  // valid pointer by calling SetStatistics() before calling Mark()
+  std::atomic<Statistics*> statistics_;
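+  // Random offset in [0, bytes_per_bit), presumably used by Mark() to dither
+  // the bit boundaries so that partially covered windows are not
+  // systematically over- or under-counted (the sync point in the constructor
+  // exposes it to tests).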
+ uint32_t rnd_;
+};
+
+// class Block is the uncompressed and "parsed" form for blocks containing
+// key-value pairs. (See BlockContents comments for more on terminology.)
+// This includes the in-memory representation of data blocks, index blocks
+// (including partitions), range deletion blocks, properties blocks, metaindex
+// blocks, as well as the top level of the partitioned filter structure (which
+// is actually an index of the filter partitions). It is NOT suitable for
+// compressed blocks in general, filter blocks/partitions, or compression
+// dictionaries.
+//
+// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
+// for details of the format and the various block types.
+//
+// TODO: Rename to ParsedKvBlock?
+class Block {
+ public:
+ // Initialize the block with the specified contents.
+ explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
+ Statistics* statistics = nullptr);
+ // No copying allowed
+ Block(const Block&) = delete;
+ void operator=(const Block&) = delete;
+
+ ~Block();
+
+ size_t size() const { return size_; }
+ const char* data() const { return data_; }
+ // The additional memory space taken by the block data.
+ size_t usable_size() const { return contents_.usable_size(); }
+ uint32_t NumRestarts() const;
+ bool own_bytes() const { return contents_.own_bytes(); }
+
+ BlockBasedTableOptions::DataBlockIndexType IndexType() const;
+
+ // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
+ // comparator.
+ //
+  // If `iter` is null, returns a new iterator;
+  // if `iter` is not null, updates it in place and returns it as Iterator*
+ //
+ // Updates read_amp_bitmap_ if it is not nullptr.
+ //
+  // If `block_contents_pinned` is true, the caller guarantees that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds a cache handle or ownership of some
+  // resource and releases it in a cleanup function, or the caller is sure
+  // that the data will not go away (for example, it comes from an mmapped
+  // file that will not be closed).
+  //
+  // NOTE: for the hash-based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
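+  //
+  // A minimal usage sketch (hypothetical caller; `block` and `ucmp` are
+  // assumptions, not defined in this header):
+  //
+  //   DataBlockIter* iter = block->NewDataIterator(
+  //       ucmp, kDisableGlobalSequenceNumber);
+  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+  //     // use iter->key() / iter->value()
+  //   }
+  //   delete iter;  // caller owns the returned iterator when `iter` is null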
+ DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
+ SequenceNumber global_seqno,
+ DataBlockIter* iter = nullptr,
+ Statistics* stats = nullptr,
+ bool block_contents_pinned = false);
+
+  // Returns a MetaBlockIter for iterating over blocks containing metadata
+  // (like Properties blocks). Unlike data blocks, the keys of these blocks
+  // do not contain sequence numbers, do not use a user-defined comparator,
+  // and do not track read amplification/statistics. Additionally, a
+  // MetaBlockIter will not assert if the block is formatted improperly.
+ //
+  // If `block_contents_pinned` is true, the caller guarantees that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds a cache handle or ownership of some
+  // resource and releases it in a cleanup function, or the caller is sure
+  // that the data will not go away (for example, it comes from an mmapped
+  // file that will not be closed).
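+  //
+  // Usage sketch (hedged; mirrors the NewDataIterator sketch above):
+  //
+  //   MetaBlockIter* meta_iter = block->NewMetaIterator();
+  //   for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
+  //     // meta keys are user keys; values are raw Slices
+  //   }
+  //   delete meta_iter;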
+ MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);
+
+ // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
+ // comparator.
+ //
+ // key_includes_seq, default true, means that the keys are in internal key
+ // format.
+ // value_is_full, default true, means that no delta encoding is
+ // applied to values.
+ //
+ // If `prefix_index` is not nullptr this block will do hash lookup for the key
+ // prefix. If total_order_seek is true, prefix_index_ is ignored.
+ //
+  // `have_first_key` controls whether IndexValue will contain
+  // first_internal_key. It affects the data serialization format, so the same
+  // have_first_key value must be used when writing and reading the index.
+  // It is determined by the IndexType property of the table.
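+  //
+  // A hedged sketch of a typical call (flag values are assumptions that must
+  // match how the index was written; `block` and `ucmp` are hypothetical):
+  //
+  //   IndexBlockIter iter;
+  //   block->NewIndexIterator(ucmp, kDisableGlobalSequenceNumber, &iter,
+  //                           /*stats=*/nullptr, /*total_order_seek=*/true,
+  //                           /*have_first_key=*/false,
+  //                           /*key_includes_seq=*/true,
+  //                           /*value_is_full=*/true);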
+ IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp,
+ SequenceNumber global_seqno,
+ IndexBlockIter* iter, Statistics* stats,
+ bool total_order_seek, bool have_first_key,
+ bool key_includes_seq, bool value_is_full,
+ bool block_contents_pinned = false,
+ BlockPrefixIndex* prefix_index = nullptr);
+
+ // Report an approximation of how much memory has been used.
+ size_t ApproximateMemoryUsage() const;
+
+ private:
+ BlockContents contents_;
+ const char* data_; // contents_.data.data()
+ size_t size_; // contents_.data.size()
+ uint32_t restart_offset_; // Offset in data_ of restart array
+ uint32_t num_restarts_;
+ std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
+ DataBlockHashIndex data_block_hash_index_;
+};
+
+// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
+// format of this data buffer is an uncompressed, sorted sequence of key-value
+// pairs (see `Block` API for more details).
+//
+// Notably, the keys may either be in internal key format or user key format.
+// Subclasses are responsible for configuring the key format.
+//
+// `BlockIter` intends to provide final overrides for all of
+// `InternalIteratorBase` functions that can move the iterator. It does
+// this to guarantee `UpdateKey()` is called exactly once after each key
+// movement potentially visible to users. In this step, the key is prepared
+// (e.g., serialized if global seqno is in effect) so it can be returned
+// immediately when the user asks for it via calling `key() const`.
+//
+// For its subclasses, it provides protected variants of the above-mentioned
+// final-overridden methods. They are named with the "Impl" suffix, e.g.,
+// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
+// "Impl" functions are responsible for positioning `raw_key_` but not
+// invoking `UpdateKey()`.
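+//
+// For example (hypothetical subclass behavior): a `SeekImpl()` override
+// positions `current_`, `raw_key_`, and `value_`, and the final `Seek()`
+// wrapper then calls `UpdateKey()` exactly once so that `key()` returns the
+// prepared key.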
+template <class TValue>
+class BlockIter : public InternalIteratorBase<TValue> {
+ public:
+ // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
+ // nothing. Calls cleanup functions.
+ virtual void Invalidate(const Status& s) {
+ // Assert that the BlockIter is never deleted while Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+
+ data_ = nullptr;
+ current_ = restarts_;
+ status_ = s;
+
+ // Call cleanup callbacks.
+ Cleanable::Reset();
+ }
+
+ bool Valid() const override { return current_ < restarts_; }
+
+ virtual void SeekToFirst() override final {
+ SeekToFirstImpl();
+ UpdateKey();
+ }
+
+ virtual void SeekToLast() override final {
+ SeekToLastImpl();
+ UpdateKey();
+ }
+
+ virtual void Seek(const Slice& target) override final {
+ SeekImpl(target);
+ UpdateKey();
+ }
+
+ virtual void SeekForPrev(const Slice& target) override final {
+ SeekForPrevImpl(target);
+ UpdateKey();
+ }
+
+ virtual void Next() override final {
+ NextImpl();
+ UpdateKey();
+ }
+
+ virtual bool NextAndGetResult(IterateResult* result) override final {
+ // This does not need to call `UpdateKey()` as the parent class only has
+ // access to the `UpdateKey()`-invoking functions.
+ return InternalIteratorBase<TValue>::NextAndGetResult(result);
+ }
+
+ virtual void Prev() override final {
+ PrevImpl();
+ UpdateKey();
+ }
+
+ Status status() const override { return status_; }
+ Slice key() const override {
+ assert(Valid());
+ return key_;
+ }
+
+#ifndef NDEBUG
+ ~BlockIter() override {
+ // Assert that the BlockIter is never deleted while Pinning is Enabled.
+    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+ status_.PermitUncheckedError();
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool IsKeyPinned() const override {
+ return block_contents_pinned_ && key_pinned_;
+ }
+
+ bool IsValuePinned() const override { return block_contents_pinned_; }
+
+ size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
+
+ uint32_t ValueOffset() const {
+ return static_cast<uint32_t>(value_.data() - data_);
+ }
+
+ void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }
+
+ Cache::Handle* cache_handle() { return cache_handle_; }
+
+ protected:
+ std::unique_ptr<InternalKeyComparator> icmp_;
+ const char* data_; // underlying block contents
+ uint32_t num_restarts_; // Number of uint32_t entries in restart array
+
+ // Index of restart block in which current_ or current_-1 falls
+ uint32_t restart_index_;
+ uint32_t restarts_; // Offset of restart array (list of fixed32)
+ // current_ is offset in data_ of current entry. >= restarts_ if !Valid
+ uint32_t current_;
+ // Raw key from block.
+ IterKey raw_key_;
+ // Buffer for key data when global seqno assignment is enabled.
+ IterKey key_buf_;
+ Slice value_;
+ Status status_;
+ // Key to be exposed to users.
+ Slice key_;
+ bool key_pinned_;
+  // Whether the block data is guaranteed to outlive this iterator, such that
+  // as long as the cleanup functions are transferred to another class,
+  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
+ bool block_contents_pinned_;
+ SequenceNumber global_seqno_;
+
+ virtual void SeekToFirstImpl() = 0;
+ virtual void SeekToLastImpl() = 0;
+ virtual void SeekImpl(const Slice& target) = 0;
+ virtual void SeekForPrevImpl(const Slice& target) = 0;
+ virtual void NextImpl() = 0;
+
+ virtual void PrevImpl() = 0;
+
+ template <typename DecodeEntryFunc>
+ inline bool ParseNextKey(bool* is_shared);
+
+ void InitializeBase(const Comparator* raw_ucmp, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno, bool block_contents_pinned) {
+ assert(data_ == nullptr); // Ensure it is called only once
+ assert(num_restarts > 0); // Ensure the param is valid
+
+ icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
+ data_ = data;
+ restarts_ = restarts;
+ num_restarts_ = num_restarts;
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ global_seqno_ = global_seqno;
+ block_contents_pinned_ = block_contents_pinned;
+ cache_handle_ = nullptr;
+ }
+
+  // Must be called every time a key is found that needs to be returned to the
+  // user, and may be called when no key is found (as a no-op). Updates `key_`,
+  // `key_buf_`, and `key_pinned_` with info about the found key.
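+  // For example (a hypothetical case): if the block stores internal key
+  // "foo" @ seqno 7 and a global seqno of 100 is in effect, `key_buf_` is
+  // rebuilt as "foo" @ seqno 100 (with the original value type) and `key_`
+  // points into that buffer, so `key_pinned_` is false.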
+ void UpdateKey() {
+ key_buf_.Clear();
+ if (!Valid()) {
+ return;
+ }
+ if (raw_key_.IsUserKey()) {
+ assert(global_seqno_ == kDisableGlobalSequenceNumber);
+ key_ = raw_key_.GetUserKey();
+ key_pinned_ = raw_key_.IsKeyPinned();
+ } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
+ key_ = raw_key_.GetInternalKey();
+ key_pinned_ = raw_key_.IsKeyPinned();
+ } else {
+ key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
+ ExtractValueType(raw_key_.GetInternalKey()));
+ key_ = key_buf_.GetInternalKey();
+ key_pinned_ = false;
+ }
+ }
+
+ // Returns the result of `Comparator::Compare()`, where the appropriate
+ // comparator is used for the block contents, the LHS argument is the current
+ // key with global seqno applied, and the RHS argument is `other`.
+ int CompareCurrentKey(const Slice& other) {
+ if (raw_key_.IsUserKey()) {
+ assert(global_seqno_ == kDisableGlobalSequenceNumber);
+ return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
+ } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
+ return icmp_->Compare(raw_key_.GetInternalKey(), other);
+ }
+ return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
+ kDisableGlobalSequenceNumber);
+ }
+
+ private:
+  // Store the cache handle, if the block is cached. We need this since the
+  // only other place the handle is stored is as an argument to the Cleanable
+  // function callback, which is hard to retrieve. When multiple PinnableSlice
+  // values reference the block, they need the cache handle in order to bump
+  // up the ref count
+ Cache::Handle* cache_handle_;
+
+ public:
+ // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const {
+ // NOTE: We don't support blocks bigger than 2GB
+ return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+ }
+
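+  // The restart array is a trailing array of fixed32 byte offsets into
+  // data_; entry `index` gives where that restart interval begins.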
+ uint32_t GetRestartPoint(uint32_t index) {
+ assert(index < num_restarts_);
+ return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+ }
+
+ void SeekToRestartPoint(uint32_t index) {
+ raw_key_.Clear();
+ restart_index_ = index;
+ // current_ will be fixed by ParseNextKey();
+
+ // ParseNextKey() starts at the end of value_, so set value_ accordingly
+ uint32_t offset = GetRestartPoint(index);
+ value_ = Slice(data_ + offset, 0);
+ }
+
+ void CorruptionError();
+
+ protected:
+ template <typename DecodeKeyFunc>
+ inline bool BinarySeek(const Slice& target, uint32_t* index,
+ bool* is_index_key_result);
+
+ void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
+ bool is_index_key_result);
+};
+
+class DataBlockIter final : public BlockIter<Slice> {
+ public:
+ DataBlockIter()
+ : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
+ DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
+ uint32_t num_restarts, SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index)
+ : DataBlockIter() {
+ Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
+ read_amp_bitmap, block_contents_pinned, data_block_hash_index);
+ }
+ void Initialize(const Comparator* raw_ucmp, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno,
+ BlockReadAmpBitmap* read_amp_bitmap,
+ bool block_contents_pinned,
+ DataBlockHashIndex* data_block_hash_index) {
+ InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
+ block_contents_pinned);
+ raw_key_.SetIsUserKey(false);
+ read_amp_bitmap_ = read_amp_bitmap;
+ last_bitmap_offset_ = current_ + 1;
+ data_block_hash_index_ = data_block_hash_index;
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ if (read_amp_bitmap_ && current_ < restarts_ &&
+ current_ != last_bitmap_offset_) {
+ read_amp_bitmap_->Mark(current_ /* current entry offset */,
+ NextEntryOffset() - 1);
+ last_bitmap_offset_ = current_;
+ }
+ return value_;
+ }
+
+ inline bool SeekForGet(const Slice& target) {
+ if (!data_block_hash_index_) {
+ SeekImpl(target);
+ UpdateKey();
+ return true;
+ }
+ bool res = SeekForGetImpl(target);
+ UpdateKey();
+ return res;
+ }
+
+ void Invalidate(const Status& s) override {
+ BlockIter::Invalidate(s);
+ // Clear prev entries cache.
+ prev_entries_keys_buff_.clear();
+ prev_entries_.clear();
+ prev_entries_idx_ = -1;
+ }
+
+ protected:
+ friend Block;
+ inline bool ParseNextDataKey(bool* is_shared);
+ void SeekToFirstImpl() override;
+ void SeekToLastImpl() override;
+ void SeekImpl(const Slice& target) override;
+ void SeekForPrevImpl(const Slice& target) override;
+ void NextImpl() override;
+ void PrevImpl() override;
+
+ private:
+ // read-amp bitmap
+ BlockReadAmpBitmap* read_amp_bitmap_;
+  // last `current_` value we report to the read-amp bitmap
+ mutable uint32_t last_bitmap_offset_;
+ struct CachedPrevEntry {
+ explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
+ size_t _key_offset, size_t _key_size, Slice _value)
+ : offset(_offset),
+ key_ptr(_key_ptr),
+ key_offset(_key_offset),
+ key_size(_key_size),
+ value(_value) {}
+
+ // offset of entry in block
+ uint32_t offset;
+ // Pointer to key data in block (nullptr if key is delta-encoded)
+ const char* key_ptr;
+ // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
+ size_t key_offset;
+ // size of key
+ size_t key_size;
+ // value slice pointing to data in block
+ Slice value;
+ };
+ std::string prev_entries_keys_buff_;
+ std::vector<CachedPrevEntry> prev_entries_;
+ int32_t prev_entries_idx_ = -1;
+
+ DataBlockHashIndex* data_block_hash_index_;
+
+ bool SeekForGetImpl(const Slice& target);
+};
+
+// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and
+// are used to store properties associated with the table.
+// Meta blocks always store user keys (no sequence numbers) and always
+// use the BytewiseComparator. Additionally, MetaBlock accesses are
+// not recorded in Statistics or for read amplification.
+class MetaBlockIter final : public BlockIter<Slice> {
+ public:
+ MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
+ void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
+ bool block_contents_pinned) {
+ // Initializes the iterator with a BytewiseComparator and
+ // the raw key being a user key.
+ InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
+ kDisableGlobalSequenceNumber, block_contents_pinned);
+ raw_key_.SetIsUserKey(true);
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return value_;
+ }
+
+ protected:
+ void SeekToFirstImpl() override;
+ void SeekToLastImpl() override;
+ void SeekImpl(const Slice& target) override;
+ void SeekForPrevImpl(const Slice& target) override;
+ void NextImpl() override;
+ void PrevImpl() override;
+};
+
+class IndexBlockIter final : public BlockIter<IndexValue> {
+ public:
+ IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
+
+ // key_includes_seq, default true, means that the keys are in internal key
+ // format.
+ // value_is_full, default true, means that no delta encoding is
+ // applied to values.
+ void Initialize(const Comparator* raw_ucmp, const char* data,
+ uint32_t restarts, uint32_t num_restarts,
+ SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
+ bool have_first_key, bool key_includes_seq,
+ bool value_is_full, bool block_contents_pinned) {
+ InitializeBase(raw_ucmp, data, restarts, num_restarts,
+ kDisableGlobalSequenceNumber, block_contents_pinned);
+ raw_key_.SetIsUserKey(!key_includes_seq);
+ prefix_index_ = prefix_index;
+ value_delta_encoded_ = !value_is_full;
+ have_first_key_ = have_first_key;
+ if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
+ global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
+ } else {
+ global_seqno_state_.reset();
+ }
+ }
+
+ Slice user_key() const override {
+ assert(Valid());
+ return raw_key_.GetUserKey();
+ }
+
+ IndexValue value() const override {
+ assert(Valid());
+ if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+ return decoded_value_;
+ } else {
+ IndexValue entry;
+ Slice v = value_;
+ Status decode_s __attribute__((__unused__)) =
+ entry.DecodeFrom(&v, have_first_key_, nullptr);
+ assert(decode_s.ok());
+ return entry;
+ }
+ }
+
+ bool IsValuePinned() const override {
+ return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
+ }
+
+ protected:
+  // IndexBlockIter follows a different contract for prefix iteration
+  // than data iterators:
+  // If the prefix of the seek key `target` exists in the file, it must
+  // return the same result as a total order seek.
+  // If the prefix of `target` doesn't exist in the file, it can either
+  // return the result of a total order seek, or set both Valid() = false
+  // and status() = NotFound().
+ void SeekImpl(const Slice& target) override;
+
+ void SeekForPrevImpl(const Slice&) override {
+ assert(false);
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::InvalidArgument(
+ "RocksDB internal error: should never call SeekForPrev() on index "
+ "blocks");
+ raw_key_.Clear();
+ value_.clear();
+ }
+
+ void PrevImpl() override;
+
+ void NextImpl() override;
+
+ void SeekToFirstImpl() override;
+
+ void SeekToLastImpl() override;
+
+ private:
+  // Whether the value is delta encoded. In that case the value is assumed to
+  // be a BlockHandle. The first value in each restart interval is the fully
+  // encoded BlockHandle; the rest only store the encoded size part of the
+  // BlockHandle. The offset of a delta encoded BlockHandle is computed by
+  // adding the sizes of the previous delta encoded values in the same restart
+  // interval to the offset of the first value in that restart interval.
+  bool value_delta_encoded_;
+  bool have_first_key_;  // value includes first_internal_key
+  BlockPrefixIndex* prefix_index_;
+  IndexValue decoded_value_;
+
+ // When sequence number overwriting is enabled, this struct contains the seqno
+ // to overwrite with, and current first_internal_key with overwritten seqno.
+ // This is rarely used, so we put it behind a pointer and only allocate when
+ // needed.
+ struct GlobalSeqnoState {
+ // First internal key according to current index entry, but with sequence
+ // number overwritten to global_seqno.
+ IterKey first_internal_key;
+ SequenceNumber global_seqno;
+
+ explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
+ };
+
+ std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
+
+  // Set *prefix_may_exist to false if no key can possibly share the same
+  // prefix as `target`. If not set, the result position should be the same
+  // as a total order Seek.
+ bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
+ // Set *prefix_may_exist to false if no key can possibly share the same
+ // prefix as `target`. If not set, the result position should be the same
+ // as total order seek.
+ bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
+ uint32_t left, uint32_t right, uint32_t* index,
+ bool* prefix_may_exist);
+ inline int CompareBlockKey(uint32_t block_index, const Slice& target);
+
+ inline bool ParseNextIndexKey();
+
+  // When value_delta_encoded_ is enabled, decodes the value, which is assumed
+  // to be a BlockHandle, and puts it into decoded_value_
+ inline void DecodeCurrentValue(bool is_shared);
+};
+
+} // namespace ROCKSDB_NAMESPACE