summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/block_based/block_prefix_index.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/rocksdb/table/block_based/block_prefix_index.cc')
-rw-r--r--src/rocksdb/table/block_based/block_prefix_index.cc232
1 files changed, 232 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc
new file mode 100644
index 000000000..f9d92c74c
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.cc
@@ -0,0 +1,232 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+ return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+ return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the entry
+// stores the block id directly. If there are more than one blocks per bucket,
+// because of hash collision or a single prefix spanning multiple blocks,
+// the entry points to an array of block ids. The block array is an array of
+// uint32_t's. The first uint32_t indicates the total number of blocks, followed
+// by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+ return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+ uint32_t index = block_id ^ kBlockArrayMask;
+ assert(index < kBlockArrayMask);
+ return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+ assert(index < kBlockArrayMask);
+ return index | kBlockArrayMask;
+}
+
+// temporary storage for prefix information during index building
+struct PrefixRecord {
+ Slice prefix;
+ uint32_t start_block;
+ uint32_t end_block;
+ uint32_t num_blocks;
+ PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+ explicit Builder(const SliceTransform* internal_prefix_extractor)
+ : internal_prefix_extractor_(internal_prefix_extractor) {}
+
+ void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+ PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+ arena_.AllocateAligned(sizeof(PrefixRecord)));
+ record->prefix = key_prefix;
+ record->start_block = start_block;
+ record->end_block = start_block + num_blocks - 1;
+ record->num_blocks = num_blocks;
+ prefixes_.push_back(record);
+ }
+
+ BlockPrefixIndex* Finish() {
+ // For now, use roughly 1:1 prefix to bucket ratio.
+ uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+ // Collect prefix records that hash to the same bucket, into a single
+ // linklist.
+ std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+ std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+ for (PrefixRecord* current : prefixes_) {
+ uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+ // merge the prefix block span if the first block of this prefix is
+ // connected to the last block of the previous prefix.
+ PrefixRecord* prev = prefixes_per_bucket[bucket];
+ if (prev) {
+ assert(current->start_block >= prev->end_block);
+ auto distance = current->start_block - prev->end_block;
+ if (distance <= 1) {
+ prev->end_block = current->end_block;
+ prev->num_blocks = prev->end_block - prev->start_block + 1;
+ num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
+ continue;
+ }
+ }
+ current->next = prev;
+ prefixes_per_bucket[bucket] = current;
+ num_blocks_per_bucket[bucket] += current->num_blocks;
+ }
+
+ // Calculate the block array buffer size
+ uint32_t total_block_array_entries = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks > 1) {
+ total_block_array_entries += (num_blocks + 1);
+ }
+ }
+
+ // Populate the final prefix block index
+ uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
+ uint32_t* buckets = new uint32_t[num_buckets];
+ uint32_t offset = 0;
+ for (uint32_t i = 0; i < num_buckets; i++) {
+ uint32_t num_blocks = num_blocks_per_bucket[i];
+ if (num_blocks == 0) {
+ assert(prefixes_per_bucket[i] == nullptr);
+ buckets[i] = kNoneBlock;
+ } else if (num_blocks == 1) {
+ assert(prefixes_per_bucket[i] != nullptr);
+ assert(prefixes_per_bucket[i]->next == nullptr);
+ buckets[i] = prefixes_per_bucket[i]->start_block;
+ } else {
+ assert(total_block_array_entries > 0);
+ assert(prefixes_per_bucket[i] != nullptr);
+ buckets[i] = EncodeIndex(offset);
+ block_array_buffer[offset] = num_blocks;
+ uint32_t* last_block = &block_array_buffer[offset + num_blocks];
+ auto current = prefixes_per_bucket[i];
+ // populate block ids from largest to smallest
+ while (current != nullptr) {
+ for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
+ *last_block = current->end_block - iter;
+ last_block--;
+ }
+ current = current->next;
+ }
+ assert(last_block == &block_array_buffer[offset]);
+ offset += (num_blocks + 1);
+ }
+ }
+
+ assert(offset == total_block_array_entries);
+
+ return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
+ buckets, total_block_array_entries,
+ block_array_buffer);
+ }
+
+ private:
+ const SliceTransform* internal_prefix_extractor_;
+
+ std::vector<PrefixRecord*> prefixes_;
+ Arena arena_;
+};
+
+Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
+ const Slice& prefixes, const Slice& prefix_meta,
+ BlockPrefixIndex** prefix_index) {
+ uint64_t pos = 0;
+ auto meta_pos = prefix_meta;
+ Status s;
+ Builder builder(internal_prefix_extractor);
+
+ while (!meta_pos.empty()) {
+ uint32_t prefix_size = 0;
+ uint32_t entry_index = 0;
+ uint32_t num_blocks = 0;
+ if (!GetVarint32(&meta_pos, &prefix_size) ||
+ !GetVarint32(&meta_pos, &entry_index) ||
+ !GetVarint32(&meta_pos, &num_blocks)) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: unable to read from it.");
+ break;
+ }
+ if (pos + prefix_size > prefixes.size()) {
+ s = Status::Corruption(
+ "Corrupted prefix meta block: size inconsistency.");
+ break;
+ }
+ Slice prefix(prefixes.data() + pos, prefix_size);
+ builder.Add(prefix, entry_index, num_blocks);
+
+ pos += prefix_size;
+ }
+
+ if (s.ok() && pos != prefixes.size()) {
+ s = Status::Corruption("Corrupted prefix meta block");
+ }
+
+ if (s.ok()) {
+ *prefix_index = builder.Finish();
+ }
+
+ return s;
+}
+
+uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
+ Slice prefix = internal_prefix_extractor_->Transform(key);
+
+ uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
+ uint32_t block_id = buckets_[bucket];
+
+ if (IsNone(block_id)) {
+ return 0;
+ } else if (IsBlockId(block_id)) {
+ *blocks = &buckets_[bucket];
+ return 1;
+ } else {
+ uint32_t index = DecodeIndex(block_id);
+ assert(index < num_block_array_buffer_entries_);
+ *blocks = &block_array_buffer_[index + 1];
+ uint32_t num_blocks = block_array_buffer_[index];
+ assert(num_blocks > 1);
+ assert(index + num_blocks < num_block_array_buffer_entries_);
+ return num_blocks;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE