summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/plain_table_index.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/rocksdb/table/plain_table_index.cc215
1 files changed, 215 insertions, 0 deletions
diff --git a/src/rocksdb/table/plain_table_index.cc b/src/rocksdb/table/plain_table_index.cc
new file mode 100644
index 00000000..43740923
--- /dev/null
+++ b/src/rocksdb/table/plain_table_index.cc
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+#include "table/plain_table_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+ assert(num_buckets > 0);
+ return hash % num_buckets;
+}
+}
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+ if (!GetVarint32(&data, &index_size_)) {
+ return Status::Corruption("Couldn't read the index size!");
+ }
+ assert(index_size_ > 0);
+ if (!GetVarint32(&data, &num_prefixes_)) {
+ return Status::Corruption("Couldn't read the index size!");
+ }
+ sub_index_size_ =
+ static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+ char* index_data_begin = const_cast<char*>(data.data());
+ index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+ sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+ return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+ uint32_t prefix_hash, uint32_t* bucket_value) const {
+ int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+ GetUnaligned(index_ + bucket, bucket_value);
+ if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+ *bucket_value ^= kSubIndexMask;
+ return kSubindex;
+ }
+ if (*bucket_value >= kMaxFileSize) {
+ return kNoPrefixForBucket;
+ } else {
+ // point directly to the file
+ return kDirectToFile;
+ }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+ uint32_t offset) {
+ if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+ current_group_ = AllocateNewGroup();
+ num_records_in_current_group_ = 0;
+ }
+ auto& new_record = current_group_[num_records_in_current_group_++];
+ new_record.hash = hash;
+ new_record.offset = offset;
+ new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+ uint32_t key_offset) {
+ if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+ ++num_prefixes_;
+ if (!is_first_record_) {
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ }
+ num_keys_per_prefix_ = 0;
+ prev_key_prefix_ = key_prefix_slice.ToString();
+ prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+ due_index_ = true;
+ }
+
+ if (due_index_) {
+ // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+ record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+ due_index_ = false;
+ }
+
+ num_keys_per_prefix_++;
+ if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+ due_index_ = true;
+ }
+ is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+ AllocateIndex();
+ std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+ std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+ BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+ keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+ ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
+ keys_per_prefix_hist_.ToString().c_str());
+
+ // From the temp data structure, populate indexes.
+ return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+ if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+ // Fall back to pure binary search if the user fails to specify a prefix
+ // extractor.
+ index_size_ = 1;
+ } else {
+ double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+ index_size_ =
+ static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+ assert(index_size_ > 0);
+ }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+ std::vector<IndexRecord*>* hash_to_offsets,
+ std::vector<uint32_t>* entries_per_bucket) {
+ bool first = true;
+ uint32_t prev_hash = 0;
+ size_t num_records = record_list_.GetNumRecords();
+ for (size_t i = 0; i < num_records; i++) {
+ IndexRecord* index_record = record_list_.At(i);
+ uint32_t cur_hash = index_record->hash;
+ if (first || prev_hash != cur_hash) {
+ prev_hash = cur_hash;
+ first = false;
+ }
+ uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+ IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+ index_record->next = prev_bucket_head;
+ (*hash_to_offsets)[bucket] = index_record;
+ (*entries_per_bucket)[bucket]++;
+ }
+
+ sub_index_size_ = 0;
+ for (auto entry_count : *entries_per_bucket) {
+ if (entry_count <= 1) {
+ continue;
+ }
+ // Only buckets with more than 1 entry will have subindex.
+ sub_index_size_ += VarintLength(entry_count);
+ // total bytes needed to store these entries' in-file offsets.
+ sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+ }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+ const std::vector<IndexRecord*>& hash_to_offsets,
+ const std::vector<uint32_t>& entries_per_bucket) {
+ ROCKS_LOG_DEBUG(ioptions_.info_log,
+ "Reserving %" PRIu32 " bytes for plain table's sub_index",
+ sub_index_size_);
+ auto total_allocate_size = GetTotalSize();
+ char* allocated = arena_->AllocateAligned(
+ total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
+
+ auto temp_ptr = EncodeVarint32(allocated, index_size_);
+ uint32_t* index =
+ reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+ char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+ uint32_t sub_index_offset = 0;
+ for (uint32_t i = 0; i < index_size_; i++) {
+ uint32_t num_keys_for_bucket = entries_per_bucket[i];
+ switch (num_keys_for_bucket) {
+ case 0:
+ // No key for bucket
+ PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+ break;
+ case 1:
+ // point directly to the file offset
+ PutUnaligned(index + i, hash_to_offsets[i]->offset);
+ break;
+ default:
+ // point to second level indexes.
+ PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
+ char* prev_ptr = &sub_index[sub_index_offset];
+ char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+ sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+ char* sub_index_pos = &sub_index[sub_index_offset];
+ IndexRecord* record = hash_to_offsets[i];
+ int j;
+ for (j = num_keys_for_bucket - 1; j >= 0 && record;
+ j--, record = record->next) {
+ EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+ }
+ assert(j == -1 && record == nullptr);
+ sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+ assert(sub_index_offset <= sub_index_size_);
+ break;
+ }
+ }
+ assert(sub_index_offset == sub_index_size_);
+
+ ROCKS_LOG_DEBUG(ioptions_.info_log,
+ "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+ index_size_, sub_index_size_);
+ return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+ "PlainTableIndexBlock";
+}; // namespace rocksdb
+
+#endif // ROCKSDB_LITE