Diffstat (limited to 'src/rocksdb/table/plain_table_index.cc')
-rw-r--r-- | src/rocksdb/table/plain_table_index.cc | 215
1 file changed, 215 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/table/plain_table_index.cc b/src/rocksdb/table/plain_table_index.cc
new file mode 100644
index 00000000..43740923
--- /dev/null
+++ b/src/rocksdb/table/plain_table_index.cc
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
+#include "table/plain_table_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  assert(num_buckets > 0);
+  return hash % num_buckets;
+}
+}
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+  if (!GetVarint32(&data, &index_size_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  assert(index_size_ > 0);
+  if (!GetVarint32(&data, &num_prefixes_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  sub_index_size_ =
+      static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+  char* index_data_begin = const_cast<char*>(data.data());
+  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+  return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+    uint32_t prefix_hash, uint32_t* bucket_value) const {
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  GetUnaligned(index_ + bucket, bucket_value);
+  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+    *bucket_value ^= kSubIndexMask;
+    return kSubindex;
+  }
+  if (*bucket_value >= kMaxFileSize) {
+    return kNoPrefixForBucket;
+  } else {
+    // point directly to the file
+    return kDirectToFile;
+  }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+                                                        uint32_t offset) {
+  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+    current_group_ = AllocateNewGroup();
+    num_records_in_current_group_ = 0;
+  }
+  auto& new_record = current_group_[num_records_in_current_group_++];
+  new_record.hash = hash;
+  new_record.offset = offset;
+  new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+                                          uint32_t key_offset) {
+  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+    ++num_prefixes_;
+    if (!is_first_record_) {
+      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+    }
+    num_keys_per_prefix_ = 0;
+    prev_key_prefix_ = key_prefix_slice.ToString();
+    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+    due_index_ = true;
+  }
+
+  if (due_index_) {
+    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+    due_index_ = false;
+  }
+
+  num_keys_per_prefix_++;
+  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+    due_index_ = true;
+  }
+  is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+  AllocateIndex();
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+  ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
+                 keys_per_prefix_hist_.ToString().c_str());
+
+  // From the temp data structure, populate indexes.
+  return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+    index_size_ =
+        static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+    assert(index_size_ > 0);
+  }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+    std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list_.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list_.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+
+  sub_index_size_ = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have subindex.
+    sub_index_size_ += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+  }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  ROCKS_LOG_DEBUG(ioptions_.info_log,
+                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
+                  sub_index_size_);
+  auto total_allocate_size = GetTotalSize();
+  char* allocated = arena_->AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
+
+  auto temp_ptr = EncodeVarint32(allocated, index_size_);
+  uint32_t* index =
+      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+  char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+  uint32_t sub_index_offset = 0;
+  for (uint32_t i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+        break;
+      case 1:
+        // point directly to the file offset
+        PutUnaligned(index + i, hash_to_offsets[i]->offset);
+        break;
+      default:
+        // point to second level indexes.
+        PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
+        char* prev_ptr = &sub_index[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= sub_index_size_);
+        break;
+    }
+  }
+  assert(sub_index_offset == sub_index_size_);
+
+  ROCKS_LOG_DEBUG(ioptions_.info_log,
+                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+                  index_size_, sub_index_size_);
+  return Slice(allocated, GetTotalSize());
}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+    "PlainTableIndexBlock";
+};  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
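For readers skimming the diff, the core of the format is the agreement between the writer, PlainTableIndexBuilder::FillIndexes(), and the reader, PlainTableIndex::GetOffset(), on how a 32-bit bucket value is interpreted. The standalone sketch below illustrates only that encoding. It is not RocksDB code: DecodeBucket and BucketKind are hypothetical names invented for the example, and the concrete constant values are assumptions mirroring table/plain_table_index.h, which is not part of this diff.

// Standalone illustration of the bucket-value encoding shared by
// PlainTableIndexBuilder::FillIndexes() and PlainTableIndex::GetOffset().
// Not RocksDB code: DecodeBucket/BucketKind are hypothetical names, and the
// constant values are assumptions mirroring table/plain_table_index.h.
#include <cassert>
#include <cstdint>
#include <cstdio>

namespace {

// High bit set => the remaining bits are an offset into the sub-index.
constexpr uint32_t kSubIndexMask = 0x80000000u;
// With the high bit clear, this sentinel marks an empty bucket
// (FillIndexes() writes it for "case 0" above).
constexpr uint32_t kMaxFileSize = kSubIndexMask - 1;

enum class BucketKind { kEmpty, kDirectToFile, kSubIndex };

// Mirrors the branch structure of PlainTableIndex::GetOffset().
BucketKind DecodeBucket(uint32_t bucket_value, uint32_t* payload) {
  if ((bucket_value & kSubIndexMask) == kSubIndexMask) {
    *payload = bucket_value ^ kSubIndexMask;  // offset into the sub-index
    return BucketKind::kSubIndex;
  }
  if (bucket_value >= kMaxFileSize) {
    return BucketKind::kEmpty;  // no prefix hashed into this bucket
  }
  *payload = bucket_value;  // file offset of the bucket's single entry
  return BucketKind::kDirectToFile;
}

}  // namespace

int main() {
  uint32_t payload = 0;

  // One entry in the bucket: the value is the file offset itself ("case 1").
  assert(DecodeBucket(4096u, &payload) == BucketKind::kDirectToFile);
  assert(payload == 4096u);

  // Several entries: mask bit set, payload is an offset into the sub-index,
  // where a varint32 count is followed by fixed32 file offsets ("default").
  assert(DecodeBucket(16u | kSubIndexMask, &payload) == BucketKind::kSubIndex);
  assert(payload == 16u);

  // Empty bucket, encoded as the kMaxFileSize sentinel ("case 0").
  assert(DecodeBucket(kMaxFileSize, &payload) == BucketKind::kEmpty);

  std::printf("bucket encoding round-trips as expected\n");
  return 0;
}

The mask-bit trick keeps the common single-entry bucket at one array read that yields a file offset directly, while buckets with collisions pay one extra indirection into the sub-index region that Finish()/FillIndexes() lay out above.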