diff options
Diffstat (limited to 'src/rocksdb/table/plain')
-rw-r--r-- | src/rocksdb/table/plain/plain_table_bloom.cc | 78 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_bloom.h | 135 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_builder.cc | 314 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_builder.h | 151 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_factory.cc | 235 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_factory.h | 223 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_index.cc | 211 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_index.h | 249 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_key_coding.cc | 498 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_key_coding.h | 193 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_reader.cc | 775 | ||||
-rw-r--r-- | src/rocksdb/table/plain/plain_table_reader.h | 246 |
12 files changed, 3308 insertions, 0 deletions
diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc new file mode 100644 index 000000000..7b6833524 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/plain/plain_table_bloom.h" + +#include <algorithm> +#include <string> +#include "util/dynamic_bloom.h" + +#include "memory/allocator.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. + if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} // namespace + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = raw_data; + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = raw; +} + +void BloomBlockBuilder::AddKeysHashes( + const std::vector<uint32_t>& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h new file mode 100644 index 000000000..fdacdb0d5 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_bloom.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <string> +#include <vector> + +#include "rocksdb/slice.h" + +#include "port/port.h" +#include "util/bloom_impl.h" +#include "util/hash.h" + +#include "third-party/folly/folly/ConstexprMath.h" + +#include <memory> + +namespace ROCKSDB_NAMESPACE { +class Slice; +class Allocator; +class Logger; + +// A legacy Bloom filter implementation used by Plain Table db format, for +// schema backward compatibility. Not for use in new filter applications. +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); } + + void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + char* data_; + + static constexpr int LOG2_CACHE_LINE_SIZE = + folly::constexpr_log2(CACHE_LINE_SIZE); +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t ignored; + LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch( + h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + if (kNumBlocks != 0) { + return LegacyLocalityBloomImpl<true>::HashMayMatch( + h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE); + } else { + return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes, + data_); + } +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + if (kNumBlocks != 0) { + LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_, + LOG2_CACHE_LINE_SIZE); + } else { + LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_); + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector<uint32_t>& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +}; // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc new file mode 100644 index 000000000..147e46db2 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_builder.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_builder.h" + +#include <assert.h> + +#include <string> +#include <limits> +#include <map> + +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// a utility that helps writing block content to the file +// @offset will advance if @block_contents was successfully written. +// @block_handle the block handle this particular block. +Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { + block_handle->set_offset(*offset); + block_handle->set_size(block_contents.size()); + Status s = file->Append(block_contents); + + if (s.ok()) { + *offset += block_contents.size(); + } + return s; +} + +} // namespace + +// kPlainTableMagicNumber was picked by running +// echo rocksdb.table.plain | sha1sum +// and taking the leading 64 bits. +extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; +extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; + +PlainTableBuilder::PlainTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, + bool store_index_in_file) + : ioptions_(ioptions), + moptions_(moptions), + bloom_block_(num_probes), + file_(file), + bloom_bits_per_key_(bloom_bits_per_key), + huge_page_tlb_size_(huge_page_tlb_size), + encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(), + index_sparseness), + store_index_in_file_(store_index_in_file), + prefix_extractor_(moptions.prefix_extractor.get()) { + // Build index block and save it in the file if hash_table_ratio > 0 + if (store_index_in_file_) { + assert(hash_table_ratio > 0 || IsTotalOrderMode()); + index_builder_.reset(new PlainTableIndexBuilder( + &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); + properties_.user_collected_properties + [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use + } + + properties_.fixed_key_len = user_key_len; + + // for plain table, we put all the data in a big chuck. + properties_.num_data_blocks = 1; + // Fill it later if store_index_in_file_ == true + properties_.index_size = 0; + properties_.filter_size = 0; + // To support roll-back to previous version, now still use version 0 for + // plain encoding. + properties_.format_version = (encoding_type == kPlain) ? 0 : 1; + properties_.column_family_id = column_family_id; + properties_.column_family_name = column_family_name; + properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->Name() + : "nullptr"; + + std::string val; + PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType())); + properties_.user_collected_properties + [PlainTablePropertyNames::kEncodingType] = val; + + for (auto& collector_factories : *int_tbl_prop_collector_factories) { + table_properties_collectors_.emplace_back( + collector_factories->CreateIntTblPropCollector(column_family_id)); + } +} + +PlainTableBuilder::~PlainTableBuilder() { +} + +void PlainTableBuilder::Add(const Slice& key, const Slice& value) { + // temp buffer for metadata bytes between key and value. + char meta_bytes_buf[6]; + size_t meta_bytes_buf_size = 0; + + ParsedInternalKey internal_key; + if (!ParseInternalKey(key, &internal_key)) { + assert(false); + return; + } + if (internal_key.type == kTypeRangeDeletion) { + status_ = Status::NotSupported("Range deletion unsupported"); + return; + } + + // Store key hash + if (store_index_in_file_) { + if (moptions_.prefix_extractor == nullptr) { + keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); + } else { + Slice prefix = + moptions_.prefix_extractor->Transform(internal_key.user_key); + keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); + } + } + + // Write value + assert(offset_ <= std::numeric_limits<uint32_t>::max()); + auto prev_offset = static_cast<uint32_t>(offset_); + // Write out the key + encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); + if (SaveIndexInFile()) { + index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); + } + + // Write value length + uint32_t value_size = static_cast<uint32_t>(value.size()); + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + + // Write value + file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + + properties_.num_entries++; + properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } + + // notify property collectors + NotifyCollectTableCollectorsOnAdd( + key, value, offset_, table_properties_collectors_, ioptions_.info_log); +} + +Status PlainTableBuilder::status() const { return status_; } + +Status PlainTableBuilder::Finish() { + assert(!closed_); + closed_ = true; + + properties_.data_size = offset_; + + // Write the following blocks + // 1. [meta block: bloom] - optional + // 2. [meta block: index] - optional + // 3. [meta block: properties] + // 4. [metaindex block] + // 5. [footer] + + MetaIndexBuilder meta_index_builer; + + if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max()); + Status s; + BlockHandle bloom_block_handle; + if (bloom_bits_per_key_ > 0) { + bloom_block_.SetTotalBits( + &arena_, + static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + + PutVarint32(&properties_.user_collected_properties + [PlainTablePropertyNames::kNumBloomBlocks], + bloom_block_.GetNumBlocks()); + + bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_); + + Slice bloom_finish_result = bloom_block_.Finish(); + + properties_.filter_size = bloom_finish_result.size(); + s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + + if (!s.ok()) { + return s; + } + meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); + } + BlockHandle index_block_handle; + Slice index_finish_result = index_builder_->Finish(); + + properties_.index_size = index_finish_result.size(); + s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + + if (!s.ok()) { + return s; + } + + meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, + index_block_handle); + } + + // Calculate bloom block size and index block size + PropertyBlockBuilder property_block_builder; + // -- Add basic properties + property_block_builder.AddTableProperty(properties_); + + property_block_builder.Add(properties_.user_collected_properties); + + // -- Add user collected properties + NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, + ioptions_.info_log, + &property_block_builder); + + // -- Write property block + BlockHandle property_block_handle; + auto s = WriteBlock( + property_block_builder.Finish(), + file_, + &offset_, + &property_block_handle + ); + if (!s.ok()) { + return s; + } + meta_index_builer.Add(kPropertiesBlock, property_block_handle); + + // -- write metaindex block + BlockHandle metaindex_block_handle; + s = WriteBlock( + meta_index_builer.Finish(), + file_, + &offset_, + &metaindex_block_handle + ); + if (!s.ok()) { + return s; + } + + // Write Footer + // no need to write out new footer if we're using default checksum + Footer footer(kLegacyPlainTableMagicNumber, 0); + footer.set_metaindex_handle(metaindex_block_handle); + footer.set_index_handle(BlockHandle::NullBlockHandle()); + std::string footer_encoding; + footer.EncodeTo(&footer_encoding); + s = file_->Append(footer_encoding); + if (s.ok()) { + offset_ += footer_encoding.size(); + } + + if (file_ != nullptr) { + file_checksum_ = file_->GetFileChecksum(); + } + return s; +} + +void PlainTableBuilder::Abandon() { + closed_ = true; +} + +uint64_t PlainTableBuilder::NumEntries() const { + return properties_.num_entries; +} + +uint64_t PlainTableBuilder::FileSize() const { + return offset_; +} + +const char* PlainTableBuilder::GetFileChecksumFuncName() const { + if (file_ != nullptr) { + return file_->GetFileChecksumFuncName(); + } else { + return kUnknownFileChecksumFuncName.c_str(); + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h new file mode 100644 index 000000000..fe2bf3cf9 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_builder.h @@ -0,0 +1,151 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE +#include <stdint.h> +#include <string> +#include <vector> +#include "db/version_edit.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/table_builder.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBuilder; +class BlockHandle; +class WritableFile; +class TableBuilder; + +// The builder class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableBuilder: public TableBuilder { + public: + // Create a builder that will store the contents of the table it is + // building in *file. Does not close the file. It is up to the + // caller to close the file after calling Finish(). The output file + // will be part of level specified by 'level'. A value of -1 means + // that the caller does not know which level the output file will reside. + PlainTableBuilder( + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, + const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* + int_tbl_prop_collector_factories, + uint32_t column_family_id, WritableFileWriter* file, + uint32_t user_key_size, EncodingType encoding_type, + size_t index_sparseness, uint32_t bloom_bits_per_key, + const std::string& column_family_name, uint32_t num_probes = 6, + size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, + bool store_index_in_file = false); + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; + + // REQUIRES: Either Finish() or Abandon() has been called. + ~PlainTableBuilder(); + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override; + + // Return non-ok iff some error has been detected. + Status status() const override; + + // Finish building the table. Stops using the file passed to the + // constructor after this function returns. + // REQUIRES: Finish(), Abandon() have not been called + Status Finish() override; + + // Indicate that the contents of this builder should be abandoned. Stops + // using the file passed to the constructor after this function returns. + // If the caller is not going to call Finish(), it must call Abandon() + // before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called + void Abandon() override; + + // Number of calls to Add() so far. + uint64_t NumEntries() const override; + + // Size of the file generated so far. If invoked after a successful + // Finish() call, returns the size of the final generated file. + uint64_t FileSize() const override; + + TableProperties GetTableProperties() const override { return properties_; } + + bool SaveIndexInFile() const { return store_index_in_file_; } + + // Get file checksum + const std::string& GetFileChecksum() const override { return file_checksum_; } + + // Get file checksum function name + const char* GetFileChecksumFuncName() const override; + + private: + Arena arena_; + const ImmutableCFOptions& ioptions_; + const MutableCFOptions& moptions_; + std::vector<std::unique_ptr<IntTblPropCollector>> + table_properties_collectors_; + + BloomBlockBuilder bloom_block_; + std::unique_ptr<PlainTableIndexBuilder> index_builder_; + + WritableFileWriter* file_; + uint64_t offset_ = 0; + uint32_t bloom_bits_per_key_; + size_t huge_page_tlb_size_; + Status status_; + TableProperties properties_; + PlainTableKeyEncoder encoder_; + + bool store_index_in_file_; + + std::vector<uint32_t> keys_or_prefixes_hashes_; + bool closed_ = false; // Either Finish() or Abandon() has been called. + + const SliceTransform* prefix_extractor_; + + // Store file checksum. If checksum is disabled, its value is "0". + std::string file_checksum_ = kUnknownFileChecksum; + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc new file mode 100644 index 000000000..c2db8f395 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_factory.cc @@ -0,0 +1,235 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_factory.h" + +#include <stdint.h> +#include <memory> +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +Status PlainTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table, + bool /*prefetch_index_and_filter_in_cache*/) const { + return PlainTableReader::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_reader_options.internal_comparator, std::move(file), file_size, + table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, + table_options_.index_sparseness, table_options_.huge_page_tlb_size, + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor); +} + +TableBuilder* PlainTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + // Ignore the skip_filters flag. PlainTable format is optimized for small + // in-memory dbs. The skip_filters optimization is not useful for plain + // tables + // + return new PlainTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_options_.user_key_len, table_options_.encoding_type, + table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.column_family_name, 6, + table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, + table_options_.store_index_in_file); +} + +std::string PlainTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " user_key_len: %u\n", + table_options_.user_key_len); + ret.append(buffer); + snprintf(buffer, kBufferSize, " bloom_bits_per_key: %d\n", + table_options_.bloom_bits_per_key); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", + table_options_.hash_table_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_sparseness: %" ROCKSDB_PRIszt "\n", + table_options_.index_sparseness); + ret.append(buffer); + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %" ROCKSDB_PRIszt "\n", + table_options_.huge_page_tlb_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " encoding_type: %d\n", + table_options_.encoding_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " full_scan_mode: %d\n", + table_options_.full_scan_mode); + ret.append(buffer); + snprintf(buffer, kBufferSize, " store_index_in_file: %d\n", + table_options_.store_index_in_file); + ret.append(buffer); + return ret; +} + +const PlainTableOptions& PlainTableFactory::table_options() const { + return table_options_; +} + +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetPlainTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr<MemTableRepFactory>* new_mem_factory) { + std::vector<std::string> opts_list = StringSplit(opts_str, ':'); + size_t len = opts_list.size(); + + if (opts_list.empty() || opts_list.size() > 2) { + return Status::InvalidArgument("Can't parse memtable_factory option ", + opts_str); + } + + MemTableRepFactory* mem_factory = nullptr; + + if (opts_list[0] == "skip_list") { + // Expecting format + // skip_list:<lookahead> + if (2 == len) { + size_t lookahead = ParseSizeT(opts_list[1]); + mem_factory = new SkipListFactory(lookahead); + } else if (1 == len) { + mem_factory = new SkipListFactory(); + } + } else if (opts_list[0] == "prefix_hash") { + // Expecting format + // prfix_hash:<hash_bucket_count> + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashSkipListRepFactory(hash_bucket_count); + } else if (1 == len) { + mem_factory = NewHashSkipListRepFactory(); + } + } else if (opts_list[0] == "hash_linkedlist") { + // Expecting format + // hash_linkedlist:<hash_bucket_count> + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashLinkListRepFactory(hash_bucket_count); + } else if (1 == len) { + mem_factory = NewHashLinkListRepFactory(); + } + } else if (opts_list[0] == "vector") { + // Expecting format + // vector:<count> + if (2 == len) { + size_t count = ParseSizeT(opts_list[1]); + mem_factory = new VectorRepFactory(count); + } else if (1 == len) { + mem_factory = new VectorRepFactory(); + } + } else if (opts_list[0] == "cuckoo") { + return Status::NotSupported( + "cuckoo hash memtable is not supported anymore."); + } else { + return Status::InvalidArgument("Unrecognized memtable_factory option ", + opts_str); + } + + if (mem_factory != nullptr) { + new_mem_factory->reset(mem_factory); + } + + return Status::OK(); +} + +std::string ParsePlainTableOptions(const std::string& name, + const std::string& org_value, + PlainTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + const auto iter = plain_table_type_info.find(name); + if (iter == plain_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} + +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped, + bool /*ignore_unknown_options*/) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParsePlainTableOptions( + o.first, o.second, new_table_options, input_strings_escaped); + if (error_message != "") { + const auto iter = plain_table_type_info.find(o.first); + if (iter == plain_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + return Status::InvalidArgument("Can't parse PlainTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { + return new PlainTableFactory(options); +} + +const std::string PlainTablePropertyNames::kEncodingType = + "rocksdb.plain.table.encoding.type"; + +const std::string PlainTablePropertyNames::kBloomVersion = + "rocksdb.plain.table.bloom.version"; + +const std::string PlainTablePropertyNames::kNumBloomBlocks = + "rocksdb.plain.table.bloom.numblocks"; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h new file mode 100644 index 000000000..64dd171cb --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_factory.h @@ -0,0 +1,223 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE +#include <memory> +#include <string> +#include <stdint.h> + +#include "options/options_helper.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +struct EnvOptions; + +class Status; +class RandomAccessFile; +class WritableFile; +class Table; +class TableBuilder; + +// PlainTableFactory is the entrance function to the PlainTable format of +// SST files. It returns instances PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of +// following downsides +// 1. Data compression is not supported. +// 2. Data is not checksumed. +// it is not recommended to use this format on other type of file systems. +// +// PlainTable requires fixed length key, configured as a constructor +// parameter of the factory class. Output file format: +// +-------------+-----------------+ +// | version | user_key_length | +// +------------++------------+-----------------+ <= key1 offset +// | encoded key1 | value_size | | +// +------------+-------------+-------------+ | +// | value1 | +// | | +// +--------------------------+-------------+---+ <= key2 offset +// | encoded key2 | value_size | | +// +------------+-------------+-------------+ | +// | value2 | +// | | +// | ...... | +// +-----------------+--------------------------+ +// +// When the key encoding type is kPlain. Key part is encoded as: +// +------------+--------------------+ +// | [key_size] | internal key | +// +------------+--------------------+ +// for the case of user_key_len = kPlainTableVariableLength case, +// and simply: +// +----------------------+ +// | internal key | +// +----------------------+ +// for user_key_len != kPlainTableVariableLength case. +// +// If key encoding type is kPrefix. Keys are encoding in this format. +// There are three ways to encode a key: +// (1) Full Key +// +---------------+---------------+-------------------+ +// | Full Key Flag | Full Key Size | Full Internal Key | +// +---------------+---------------+-------------------+ +// which simply encodes a full key +// +// (2) A key shared the same prefix as the previous key, which is encoded as +// format of (1). +// +-------------+-------------+-------------+-------------+------------+ +// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | +// +-------------+-------------+-------------+-------------+------------+ +// where key is the suffix part of the key, including the internal bytes. +// the actual key will be constructed by concatenating prefix part of the +// previous key, with the suffix part of the key here, with sizes given here. +// +// (3) A key shared the same prefix as the previous key, which is encoded as +// the format of (2). +// +-----------------+-----------------+------------------------+ +// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | +// +-----------------+-----------------+------------------------+ +// The key will be constructed by concatenating previous key's prefix (which is +// also a prefix which the last key encoded in the format of (1)) and the +// key given here. +// +// For example, we for following keys (prefix and suffix are separated by +// spaces): +// 0000 0001 +// 0000 00021 +// 0000 0002 +// 00011 00 +// 0002 0001 +// Will be encoded like this: +// FK 8 00000001 +// PF 4 SF 5 00021 +// SF 4 0002 +// FK 7 0001100 +// FK 8 00020001 +// (where FK means full key flag, PF means prefix flag and SF means suffix flag) +// +// All those "key flag + key size" shown above are in this format: +// The 8 bits of the first byte: +// +----+----+----+----+----+----+----+----+ +// | Type | Size | +// +----+----+----+----+----+----+----+----+ +// Type indicates: full key, prefix, or suffix. +// The last 6 bits are for size. If the size bits are not all 1, it means the +// size of the key. Otherwise, varint32 is read after this byte. This varint +// value + 0x3F (the value of all 1) will be the key size. +// +// For example, full key with length 16 will be encoded as (binary): +// 00 010000 +// (00 means full key) +// and a prefix with 100 bytes will be encoded as: +// 01 111111 00100101 +// (63) (37) +// (01 means key suffix) +// +// All the internal keys above (including kPlain and kPrefix) are encoded in +// this format: +// There are two types: +// (1) normal internal key format +// +----------- ...... -------------+----+---+---+---+---+---+---+---+ +// | user key |type| sequence ID | +// +----------- ..... --------------+----+---+---+---+---+---+---+---+ +// (2) Special case for keys whose sequence ID is 0 and is value type +// +----------- ...... -------------+----+ +// | user key |0x80| +// +----------- ..... --------------+----+ +// To save 7 bytes for the special case where sequence ID = 0. +// +// +class PlainTableFactory : public TableFactory { + public: + ~PlainTableFactory() {} + // user_key_len is the length of the user key. If it is set to be + // kPlainTableVariableLength, then it means variable length. Otherwise, all + // the keys need to have the fix length of this value. bloom_bits_per_key is + // number of bits used for bloom filer per key. hash_table_ratio is + // the desired utilization of the hash table used for prefix hashing. + // hash_table_ratio = number of prefixes / #buckets in the hash table + // hash_table_ratio = 0 means skip hash table but only replying on binary + // search. + // index_sparseness determines index interval for keys + // inside the same prefix. It will be the maximum number of linear search + // required after hash and binary search. + // index_sparseness = 0 means index for every key. + // huge_page_tlb_size determines whether to allocate hash indexes from huge + // page TLB and the page size if allocating from there. See comments of + // Arena::AllocateAligned() for details. + explicit PlainTableFactory( + const PlainTableOptions& _table_options = PlainTableOptions()) + : table_options_(_table_options) {} + + const char* Name() const override { return "PlainTable"; } + Status NewTableReader(const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + bool prefetch_index_and_filter_in_cache) const override; + + TableBuilder* NewTableBuilder( + const TableBuilderOptions& table_builder_options, + uint32_t column_family_id, WritableFileWriter* file) const override; + + std::string GetPrintableTableOptions() const override; + + const PlainTableOptions& table_options() const; + + static const char kValueTypeSeqId0 = char(~0); + + // Sanitizes the specified DB Options. + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { + return Status::OK(); + } + + void* GetOptions() override { return &table_options_; } + + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { + return Status::OK(); + } + + private: + PlainTableOptions table_options_; +}; + +static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, false, 0}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, false, 0}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc new file mode 100644 index 000000000..1099dfa6e --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_index.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include <cinttypes> + +#include "table/plain/plain_table_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + assert(num_buckets > 0); + return hash % num_buckets; +} +} + +Status PlainTableIndex::InitFromRawData(Slice data) { + if (!GetVarint32(&data, &index_size_)) { + return Status::Corruption("Couldn't read the index size!"); + } + assert(index_size_ > 0); + if (!GetVarint32(&data, &num_prefixes_)) { + return Status::Corruption("Couldn't read the index size!"); + } + sub_index_size_ = + static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen; + + char* index_data_begin = const_cast<char*>(data.data()); + index_ = reinterpret_cast<uint32_t*>(index_data_begin); + sub_index_ = reinterpret_cast<char*>(index_ + index_size_); + return Status::OK(); +} + +PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( + uint32_t prefix_hash, uint32_t* bucket_value) const { + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + GetUnaligned(index_ + bucket, bucket_value); + if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { + *bucket_value ^= kSubIndexMask; + return kSubindex; + } + if (*bucket_value >= kMaxFileSize) { + return kNoPrefixForBucket; + } else { + // point directly to the file + return kDirectToFile; + } +} + +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, + uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; +} + +void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, + uint32_t key_offset) { + if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { + ++num_prefixes_; + if (!is_first_record_) { + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + } + num_keys_per_prefix_ = 0; + prev_key_prefix_ = key_prefix_slice.ToString(); + prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); + due_index_ = true; + } + + if (due_index_) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list_.AddRecord(prev_key_prefix_hash_, key_offset); + due_index_ = false; + } + + num_keys_per_prefix_++; + if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { + due_index_ = true; + } + is_first_record_ = false; +} + +Slice PlainTableIndexBuilder::Finish() { + AllocateIndex(); + std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr); + std::vector<uint32_t> entries_per_bucket(index_size_, 0); + BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); + + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist_.ToString().c_str()); + + // From the temp data structure, populate indexes. + return FillIndexes(hash_to_offsets, entries_per_bucket); +} + +void PlainTableIndexBuilder::AllocateIndex() { + if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. + index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / hash_table_ratio_; + index_size_ = + static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1; + assert(index_size_ > 0); + } +} + +void PlainTableIndexBuilder::BucketizeIndexes( + std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket) { + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list_.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list_.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + (*entries_per_bucket)[bucket]++; + } + + sub_index_size_ = 0; + for (auto entry_count : *entries_per_bucket) { + if (entry_count <= 1) { + continue; + } + // Only buckets with more than 1 entry will have subindex. + sub_index_size_ += VarintLength(entry_count); + // total bytes needed to store these entries' in-file offsets. + sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; + } +} + +Slice PlainTableIndexBuilder::FillIndexes( + const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket) { + ROCKS_LOG_DEBUG(ioptions_.info_log, + "Reserving %" PRIu32 " bytes for plain table's sub_index", + sub_index_size_); + auto total_allocate_size = GetTotalSize(); + char* allocated = arena_->AllocateAligned( + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + + auto temp_ptr = EncodeVarint32(allocated, index_size_); + uint32_t* index = + reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_)); + char* sub_index = reinterpret_cast<char*>(index + index_size_); + + uint32_t sub_index_offset = 0; + for (uint32_t i = 0; i < index_size_; i++) { + uint32_t num_keys_for_bucket = entries_per_bucket[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize); + break; + case 1: + // point directly to the file offset + PutUnaligned(index + i, hash_to_offsets[i]->offset); + break; + default: + // point to second level indexes. + PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask); + char* prev_ptr = &sub_index[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr); + char* sub_index_pos = &sub_index[sub_index_offset]; + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); + } + assert(j == -1 && record == nullptr); + sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; + assert(sub_index_offset <= sub_index_size_); + break; + } + } + assert(sub_index_offset == sub_index_size_); + + ROCKS_LOG_DEBUG(ioptions_.info_log, + "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, + index_size_, sub_index_size_); + return Slice(allocated, GetTotalSize()); +} + +const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = + "PlainTableIndexBlock"; +}; // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h new file mode 100644 index 000000000..86385b906 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_index.h @@ -0,0 +1,249 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <string> +#include <vector> + +#include "db/dbformat.h" +#include "memory/arena.h" +#include "monitoring/histogram.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" + +namespace ROCKSDB_NAMESPACE { + +// The file contains two classes PlainTableIndex and PlainTableIndexBuilder +// The two classes implement the index format of PlainTable. +// For descripton of PlainTable format, see comments of class +// PlainTableFactory +// +// +// PlainTableIndex contains buckets size of index_size_, each is a +// 32-bit integer. The lower 31 bits contain an offset value (explained below) +// and the first bit of the integer indicates type of the offset. +// +// +--------------+------------------------------------------------------+ +// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + +// +--------------+------------------------------------------------------+ +// +// Explanation for the "flag bit": +// +// 0 indicates that the bucket contains only one prefix (no conflict when +// hashing this prefix), whose first row starts from this offset of the +// file. +// 1 indicates that the bucket contains more than one prefixes, or there +// are too many rows for one prefix so we need a binary search for it. In +// this case, the offset indicates the offset of sub_index_ holding the +// binary search indexes of keys for those rows. Those binary search indexes +// are organized in this way: +// +// The first 4 bytes, indicate how many indexes (N) are stored after it. After +// it, there are N 32-bit integers, each points of an offset of the file, +// which +// points to starting of a row. Those offsets need to be guaranteed to be in +// ascending order so the keys they are pointing to are also in ascending +// order +// to make sure we can use them to do binary searches. Below is visual +// presentation of a bucket. +// +// <begin> +// number_of_records: varint32 +// record 1 file offset: fixedint32 +// record 2 file offset: fixedint32 +// .... +// record N file offset: fixedint32 +// <end> + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by PlainTableReader class. +class PlainTableIndex { + public: + enum IndexSearchResult { + kNoPrefixForBucket = 0, + kDirectToFile = 1, + kSubindex = 2 + }; + + explicit PlainTableIndex(Slice data) { InitFromRawData(data); } + + PlainTableIndex() + : index_size_(0), + sub_index_size_(0), + num_prefixes_(0), + index_(nullptr), + sub_index_(nullptr) {} + + // The function that executes the lookup the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. + IndexSearchResult GetOffset(uint32_t prefix_hash, + uint32_t* bucket_value) const; + + // Initialize data from `index_data`, which points to raw data for + // index stored in the SST file. + Status InitFromRawData(Slice index_data); + + // Decode the sub index for specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. + const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, + uint32_t* upper_bound) const { + const char* index_ptr = &sub_index_[offset]; + return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); + } + + uint32_t GetIndexSize() const { return index_size_; } + + uint32_t GetSubIndexSize() const { return sub_index_size_; } + + uint32_t GetNumPrefixes() const { return num_prefixes_; } + + static const uint64_t kMaxFileSize = (1u << 31) - 1; + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + + private: + uint32_t index_size_; + uint32_t sub_index_size_; + uint32_t num_prefixes_; + + uint32_t* index_; + char* sub_index_; +}; + +// PlainTableIndexBuilder is used to create plain table index. +// After calling Finish(), it returns Slice, which is usually +// used either to initialize PlainTableIndex or +// to save index to sst file. +// For more details about the index, please refer to: +// https://github.com/facebook/rocksdb/wiki/PlainTable-Format +// #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class. +class PlainTableIndexBuilder { + public: + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + const SliceTransform* prefix_extractor, + size_t index_sparseness, double hash_table_ratio, + size_t huge_page_tlb_size) + : arena_(arena), + ioptions_(ioptions), + record_list_(kRecordsPerGroup), + is_first_record_(true), + due_index_(false), + num_prefixes_(0), + num_keys_per_prefix_(0), + prev_key_prefix_hash_(0), + index_sparseness_(index_sparseness), + index_size_(0), + sub_index_size_(0), + prefix_extractor_(prefix_extractor), + hash_table_ratio_(hash_table_ratio), + huge_page_tlb_size_(huge_page_tlb_size) {} + + void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); + + Slice Finish(); + + uint32_t GetTotalSize() const { + return VarintLength(index_size_) + VarintLength(num_prefixes_) + + PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; + } + + static const std::string kPlainTableIndexBlock; + + private: + struct IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; + }; + + // Helper class to track all the index records + class IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(uint32_t hash, uint32_t offset); + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup] + [index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + // Each group in `groups_` contains fix-sized records (determined by + // kNumRecordsPerGroup). Which can help us minimize the cost if resizing + // occurs. + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector<IndexRecord*> groups_; + size_t num_records_in_current_group_; + }; + + void AllocateIndex(); + + // Internal helper function to bucket index record list to hash buckets. + void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, + std::vector<uint32_t>* entries_per_bucket); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. + Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, + const std::vector<uint32_t>& entries_per_bucket); + + Arena* arena_; + const ImmutableCFOptions ioptions_; + HistogramImpl keys_per_prefix_hist_; + IndexRecordList record_list_; + bool is_first_record_; + bool due_index_; + uint32_t num_prefixes_; + uint32_t num_keys_per_prefix_; + + uint32_t prev_key_prefix_hash_; + size_t index_sparseness_; + uint32_t index_size_; + uint32_t sub_index_size_; + + const SliceTransform* prefix_extractor_; + double hash_table_ratio_; + size_t huge_page_tlb_size_; + + std::string prev_key_prefix_; + + static const size_t kRecordsPerGroup = 256; +}; + +}; // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc new file mode 100644 index 000000000..d82b969ba --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_key_coding.cc @@ -0,0 +1,498 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "table/plain/plain_table_key_coding.h" + +#include <algorithm> +#include <string> +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +enum PlainTableEntryType : unsigned char { + kFullKey = 0, + kPrefixFromPreviousKey = 1, + kKeySuffix = 2, +}; + +namespace { + +// Control byte: +// First two bits indicate type of entry +// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes +// are used. key_size-0x3F will be encoded as a variint32 after this bytes. + +const unsigned char kSizeInlineLimit = 0x3F; + +// Return 0 for error +size_t EncodeSize(PlainTableEntryType type, uint32_t key_size, + char* out_buffer) { + out_buffer[0] = type << 6; + + if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) { + // size inlined + out_buffer[0] |= static_cast<char>(key_size); + return 1; + } else { + out_buffer[0] |= kSizeInlineLimit; + char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit); + return ptr - out_buffer; + } +} +} // namespace + +// Fill bytes_read with number of bytes read. +inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, + uint32_t* key_size, + uint32_t* bytes_read) { + Slice next_byte_slice; + bool success = file_reader_.Read(start_offset, 1, &next_byte_slice); + if (!success) { + return file_reader_.status(); + } + *entry_type = static_cast<PlainTableEntryType>( + (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >> + 6); + char inline_key_size = next_byte_slice[0] & kSizeInlineLimit; + if (inline_key_size < kSizeInlineLimit) { + *key_size = inline_key_size; + *bytes_read = 1; + return Status::OK(); + } else { + uint32_t extra_size; + uint32_t tmp_bytes_read; + success = file_reader_.ReadVarint32(start_offset + 1, &extra_size, + &tmp_bytes_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_bytes_read > 0); + *key_size = kSizeInlineLimit + extra_size; + *bytes_read = tmp_bytes_read + 1; + return Status::OK(); + } +} + +Status PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(key, &parsed_key)) { + return Status::Corruption(Slice()); + } + + Slice key_to_write = key; // Portion of internal key to write out. + + uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8); + if (encoding_type_ == kPlain) { + if (fixed_user_key_len_ == kPlainTableVariableLength) { + // Write key length + char key_size_buf[5]; // tmp buffer for key size as varint32 + char* ptr = EncodeVarint32(key_size_buf, user_key_size); + assert(ptr <= key_size_buf + sizeof(key_size_buf)); + auto len = ptr - key_size_buf; + Status s = file->Append(Slice(key_size_buf, len)); + if (!s.ok()) { + return s; + } + *offset += len; + } + } else { + assert(encoding_type_ == kPrefix); + char size_bytes[12]; + size_t size_bytes_pos = 0; + + Slice prefix = + prefix_extractor_->Transform(Slice(key.data(), user_key_size)); + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() || + key_count_for_prefix_ % index_sparseness_ == 0) { + key_count_for_prefix_ = 1; + pre_prefix_.SetUserKey(prefix); + size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); + Status s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!s.ok()) { + return s; + } + *offset += size_bytes_pos; + } else { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { + // For second key within a prefix, need to encode prefix length + size_bytes_pos += + EncodeSize(kPrefixFromPreviousKey, + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()), + size_bytes + size_bytes_pos); + } + uint32_t prefix_len = + static_cast<uint32_t>(pre_prefix_.GetUserKey().size()); + size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, + size_bytes + size_bytes_pos); + Status s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!s.ok()) { + return s; + } + *offset += size_bytes_pos; + key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); + } + } + + // Encode full key + // For value size as varint32 (up to 5 bytes). + // If the row is of value type with seqId 0, flush the special flag together + // in this buffer to safe one file append call, which takes 1 byte. + if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { + Status s = + file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); + if (!s.ok()) { + return s; + } + *offset += key_to_write.size() - 8; + meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; + *meta_bytes_buf_size += 1; + } else { + file->Append(key_to_write); + *offset += key_to_write.size(); + } + + return Status::OK(); +} + +Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, + uint32_t len) { + assert(file_offset + len <= file_info_->data_end_offset); + return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset), + len); +} + +bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, + Slice* out) { + const uint32_t kPrefetchSize = 256u; + + // Try to read from buffers. + for (uint32_t i = 0; i < num_buf_; i++) { + Buffer* buffer = buffers_[num_buf_ - 1 - i].get(); + if (file_offset >= buffer->buf_start_offset && + file_offset + len <= buffer->buf_start_offset + buffer->buf_len) { + *out = GetFromBuffer(buffer, file_offset, len); + return true; + } + } + + Buffer* new_buffer; + // Data needed is not in any of the buffer. Allocate a new buffer. + if (num_buf_ < buffers_.size()) { + // Add a new buffer + new_buffer = new Buffer(); + buffers_[num_buf_++].reset(new_buffer); + } else { + // Now simply replace the last buffer. Can improve the placement policy + // if needed. + new_buffer = buffers_[num_buf_ - 1].get(); + } + + assert(file_offset + len <= file_info_->data_end_offset); + uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, + std::max(kPrefetchSize, len)); + if (size_to_read > new_buffer->buf_capacity) { + new_buffer->buf.reset(new char[size_to_read]); + new_buffer->buf_capacity = size_to_read; + new_buffer->buf_len = 0; + } + Slice read_result; + Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, + new_buffer->buf.get()); + if (!s.ok()) { + status_ = s; + return false; + } + new_buffer->buf_start_offset = file_offset; + new_buffer->buf_len = size_to_read; + *out = GetFromBuffer(new_buffer, file_offset, len); + return true; +} + +inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + if (file_info_->is_mmap_mode) { + const char* start = file_info_->file_data.data() + offset; + const char* limit = + file_info_->file_data.data() + file_info_->data_end_offset; + const char* key_ptr = GetVarint32Ptr(start, limit, out); + assert(key_ptr != nullptr); + *bytes_read = static_cast<uint32_t>(key_ptr - start); + return true; + } else { + return ReadVarint32NonMmap(offset, out, bytes_read); + } +} + +bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out, + uint32_t* bytes_read) { + const char* start; + const char* limit; + const uint32_t kMaxVarInt32Size = 6u; + uint32_t bytes_to_read = + std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size); + Slice bytes; + if (!Read(offset, bytes_to_read, &bytes)) { + return false; + } + start = bytes.data(); + limit = bytes.data() + bytes.size(); + + const char* key_ptr = GetVarint32Ptr(start, limit, out); + *bytes_read = + (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0; + return true; +} + +Status PlainTableKeyDecoder::ReadInternalKey( + uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key, + uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) { + Slice tmp_slice; + bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice); + if (!success) { + return file_reader_.status(); + } + if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) { + // Special encoding for the row with seqID=0 + parsed_key->user_key = Slice(tmp_slice.data(), user_key_size); + parsed_key->sequence = 0; + parsed_key->type = kTypeValue; + *bytes_read += user_key_size + 1; + *internal_key_valid = false; + } else { + success = file_reader_.Read(file_offset, user_key_size + 8, internal_key); + if (!success) { + return file_reader_.status(); + } + *internal_key_valid = true; + if (!ParseInternalKey(*internal_key, parsed_key)) { + return Status::Corruption( + Slice("Incorrect value type found when reading the next key")); + } + *bytes_read += user_key_size + 8; + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* /*seekable*/) { + uint32_t user_key_size = 0; + Status s; + if (fixed_user_key_len_ != kPlainTableVariableLength) { + user_key_size = fixed_user_key_len_; + } else { + uint32_t tmp_size = 0; + uint32_t tmp_read; + bool success = + file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read); + if (!success) { + return file_reader_.status(); + } + assert(tmp_read > 0); + user_key_size = tmp_size; + *bytes_read = tmp_read; + } + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + cur_key_.SetInternalKey(*parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), user_key_size); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else if (internal_key != nullptr) { + if (decoded_internal_key_valid) { + *internal_key = decoded_internal_key; + } else { + // Need to copy out the internal key + cur_key_.SetInternalKey(*parsed_key); + *internal_key = cur_key_.GetInternalKey(); + } + } + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextPrefixEncodingKey( + uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, + uint32_t* bytes_read, bool* seekable) { + PlainTableEntryType entry_type; + + bool expect_suffix = false; + Status s; + do { + uint32_t size = 0; + // dummy initial value to avoid compiler complain + bool decoded_internal_key_valid = true; + uint32_t my_bytes_read = 0; + s = DecodeSize(start_offset + *bytes_read, &entry_type, &size, + &my_bytes_read); + if (!s.ok()) { + return s; + } + if (my_bytes_read == 0) { + return Status::Corruption("Unexpected EOF when reading size of the key"); + } + *bytes_read += my_bytes_read; + + switch (entry_type) { + case kFullKey: { + expect_suffix = false; + Slice decoded_internal_key; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &decoded_internal_key); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode || + (internal_key != nullptr && !decoded_internal_key_valid)) { + // In non-mmap mode, always need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + cur_key_.SetInternalKey(*parsed_key); + saved_user_key_ = cur_key_.GetUserKey(); + if (!file_reader_.file_info()->is_mmap_mode) { + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), size); + } + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + } else { + if (internal_key != nullptr) { + *internal_key = decoded_internal_key; + } + saved_user_key_ = parsed_key->user_key; + } + break; + } + case kPrefixFromPreviousKey: { + if (seekable != nullptr) { + *seekable = false; + } + prefix_len_ = size; + assert(prefix_extractor_ == nullptr || + prefix_extractor_->Transform(saved_user_key_).size() == + prefix_len_); + // Need read another size flag for suffix + expect_suffix = true; + break; + } + case kKeySuffix: { + expect_suffix = false; + if (seekable != nullptr) { + *seekable = false; + } + + Slice tmp_slice; + s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key, + bytes_read, &decoded_internal_key_valid, + &tmp_slice); + if (!s.ok()) { + return s; + } + if (!file_reader_.file_info()->is_mmap_mode) { + // In non-mmap mode, we need to make a copy of keys returned to + // users, because after reading value for the key, the key might + // be invalid. + // saved_user_key_ points to cur_key_. We are making a copy of + // the prefix part to another string, and construct the current + // key from the prefix part and the suffix part back to cur_key_. + std::string tmp = + Slice(saved_user_key_.data(), prefix_len_).ToString(); + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(tmp, *parsed_key); + parsed_key->user_key = + Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size); + saved_user_key_ = cur_key_.GetUserKey(); + } else { + cur_key_.Reserve(prefix_len_ + size); + cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_), + *parsed_key); + } + parsed_key->user_key = cur_key_.GetUserKey(); + if (internal_key != nullptr) { + *internal_key = cur_key_.GetInternalKey(); + } + break; + } + default: + return Status::Corruption("Un-identified size flag."); + } + } while (expect_suffix); // Another round if suffix is expected. + return Status::OK(); +} + +Status PlainTableKeyDecoder::NextKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + uint32_t* bytes_read, bool* seekable) { + assert(value != nullptr); + Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read, + seekable); + if (s.ok()) { + assert(bytes_read != nullptr); + uint32_t value_size; + uint32_t value_size_bytes; + bool success = file_reader_.ReadVarint32(start_offset + *bytes_read, + &value_size, &value_size_bytes); + if (!success) { + return file_reader_.status(); + } + if (value_size_bytes == 0) { + return Status::Corruption( + "Unexpected EOF when reading the next value's size."); + } + *bytes_read += value_size_bytes; + success = file_reader_.Read(start_offset + *bytes_read, value_size, value); + if (!success) { + return file_reader_.status(); + } + *bytes_read += value_size; + } + return s; +} + +Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, + uint32_t* bytes_read, + bool* seekable) { + *bytes_read = 0; + if (seekable != nullptr) { + *seekable = true; + } + Status s; + if (encoding_type_ == kPlain) { + return NextPlainEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } else { + assert(encoding_type_ == kPrefix); + return NextPrefixEncodingKey(start_offset, parsed_key, internal_key, + bytes_read, seekable); + } +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LIT diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h new file mode 100644 index 000000000..d1460837d --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_key_coding.h @@ -0,0 +1,193 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include <array> +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "table/plain/plain_table_reader.h" + +// The file contains three helper classes of PlainTable format, +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes issue the lowest level of operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory. +namespace ROCKSDB_NAMESPACE { + +class WritableFile; +struct ParsedInternalKey; +struct PlainTableReaderFileInfo; +enum PlainTableEntryType : unsigned char; + +// Helper class for PlainTable format to write out a key to an output file +// The class is used in PlainTableBuilder. +class PlainTableKeyEncoder { + public: + explicit PlainTableKeyEncoder(EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor, + size_t index_sparseness) + : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), + key_count_for_prefix_(0) {} + // key: the key to write out, in the format of internal key. + // file: the output file to write out + // offset: offset in the file. Needs to be updated after appending bytes + // for the key + // meta_bytes_buf: buffer for extra meta bytes + // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated + // if meta_bytes_buf is updated. + Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, + char* meta_bytes_buf, size_t* meta_bytes_buf_size); + + // Return actual encoding type to be picked + EncodingType GetEncodingType() { return encoding_type_; } + + private: + EncodingType encoding_type_; + uint32_t fixed_user_key_len_; + const SliceTransform* prefix_extractor_; + const size_t index_sparseness_; + size_t key_count_for_prefix_; + IterKey pre_prefix_; +}; + +// The class does raw file reads for PlainTableReader. +// It hides whether it is a mmap-read, or a non-mmap read. +// The class is implemented in a way to favor the performance of mmap case. +// The class is used by PlainTableReader. +class PlainTableFileReader { + public: + explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) + : file_info_(_file_info), num_buf_(0) {} + // In mmaped mode, the results point to mmaped area of the file, which + // means it is always valid before closing the file. + // In non-mmap mode, the results point to an internal buffer. If the caller + // makes another read call, the results may not be valid. So callers should + // make a copy when needed. + // In order to save read calls to files, we keep two internal buffers: + // the first read and the most recent read. This is efficient because it + // columns these two common use cases: + // (1) hash index only identify one location, we read the key to verify + // the location, and read key and value if it is the right location. + // (2) after hash index checking, we identify two locations (because of + // hash bucket conflicts), we binary search the two location to see + // which one is what we need and start to read from the location. + // These two most common use cases will be covered by the two buffers + // so that we don't need to re-read the same location. + // Currently we keep a fixed size buffer. If a read doesn't exactly fit + // the buffer, we replace the second buffer with the location user reads. + // + // If return false, status code is stored in status_. + bool Read(uint32_t file_offset, uint32_t len, Slice* out) { + if (file_info_->is_mmap_mode) { + assert(file_offset + len <= file_info_->data_end_offset); + *out = Slice(file_info_->file_data.data() + file_offset, len); + return true; + } else { + return ReadNonMmap(file_offset, len, out); + } + } + + // If return false, status code is stored in status_. + bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); + + // *bytes_read = 0 means eof. false means failure and status is saved + // in status_. Not directly returning Status to save copying status + // object to map previous performance of mmap mode. + inline bool ReadVarint32(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, + uint32_t* bytes_read); + + Status status() const { return status_; } + + const PlainTableReaderFileInfo* file_info() { return file_info_; } + + private: + const PlainTableReaderFileInfo* file_info_; + + struct Buffer { + Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} + std::unique_ptr<char[]> buf; + uint32_t buf_start_offset; + uint32_t buf_len; + uint32_t buf_capacity; + }; + + // Keep buffers for two recent reads. + std::array<std::unique_ptr<Buffer>, 2> buffers_; + uint32_t num_buf_; + Status status_; + + Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); +}; + +// A helper class to decode keys from input buffer +// The class is used by PlainTableBuilder. +class PlainTableKeyDecoder { + public: + explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, + EncodingType encoding_type, + uint32_t user_key_len, + const SliceTransform* prefix_extractor) + : file_reader_(file_info), + encoding_type_(encoding_type), + prefix_len_(0), + fixed_user_key_len_(user_key_len), + prefix_extractor_(prefix_extractor), + in_prefix_(false) {} + // Find the next key. + // start: char array where the key starts. + // limit: boundary of the char array + // parsed_key: the output of the result key + // internal_key: if not null, fill with the output of the result key in + // un-parsed format + // bytes_read: how many bytes read from start. Output + // seekable: whether key can be read from this place. Used when building + // indexes. Output. + Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, uint32_t* bytes_read, + bool* seekable = nullptr); + + Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + + PlainTableFileReader file_reader_; + EncodingType encoding_type_; + uint32_t prefix_len_; + uint32_t fixed_user_key_len_; + Slice saved_user_key_; + IterKey cur_key_; + const SliceTransform* prefix_extractor_; + bool in_prefix_; + + private: + Status NextPlainEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status NextPrefixEncodingKey(uint32_t start_offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, uint32_t* bytes_read, + bool* seekable = nullptr); + Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, + ParsedInternalKey* parsed_key, uint32_t* bytes_read, + bool* internal_key_valid, Slice* internal_key); + inline Status DecodeSize(uint32_t start_offset, + PlainTableEntryType* entry_type, uint32_t* key_size, + uint32_t* bytes_read); +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc new file mode 100644 index 000000000..55756d9c1 --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_reader.cc @@ -0,0 +1,775 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include "table/plain/plain_table_reader.h" + +#include <string> +#include <vector> + +#include "db/dbformat.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" + +#include "table/block_based/block.h" +#include "table/block_based/filter_block.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/two_level_iterator.h" + +#include "memory/arena.h" +#include "monitoring/histogram.h" +#include "monitoring/perf_context_imp.h" +#include "util/coding.h" +#include "util/dynamic_bloom.h" +#include "util/hash.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Safely getting a uint32_t element from a char array, where, starting from +// `base`, every 4 bytes are considered as an fixed 32 bit integer. +inline uint32_t GetFixed32Element(const char* base, size_t offset) { + return DecodeFixed32(base + offset * sizeof(uint32_t)); +} +} // namespace + +// Iterator to iterate IndexedTable +class PlainTableIterator : public InternalIterator { + public: + explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; + + ~PlainTableIterator() override; + + bool Valid() const override; + + void SeekToFirst() override; + + void SeekToLast() override; + + void Seek(const Slice& target) override; + + void SeekForPrev(const Slice& target) override; + + void Next() override; + + void Prev() override; + + Slice key() const override; + + Slice value() const override; + + Status status() const override; + + private: + PlainTableReader* table_; + PlainTableKeyDecoder decoder_; + bool use_prefix_seek_; + uint32_t offset_; + uint32_t next_offset_; + Slice key_; + Slice value_; + Status status_; +}; + +extern const uint64_t kPlainTableMagicNumber; +PlainTableReader::PlainTableReader( + const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) + : internal_comparator_(icomparator), + encoding_type_(encoding_type), + full_scan_mode_(false), + user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), + prefix_extractor_(prefix_extractor), + enable_bloom_(false), + bloom_(6), + file_info_(std::move(file), storage_options, + static_cast<uint32_t>(table_properties->data_size)), + ioptions_(ioptions), + file_size_(file_size), + table_properties_(nullptr) {} + +PlainTableReader::~PlainTableReader() { +} + +Status PlainTableReader::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table, + const SliceTransform* prefix_extractor) { + if (file_size > PlainTableIndex::kMaxFileSize) { + return Status::NotSupported("File is too large for PlainTableReader!"); + } + + TableProperties* props_ptr = nullptr; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + ioptions, &props_ptr, + true /* compression_type_missing */); + std::shared_ptr<TableProperties> props(props_ptr); + if (!s.ok()) { + return s; + } + + assert(hash_table_ratio >= 0.0); + auto& user_props = props->user_collected_properties; + auto prefix_extractor_in_file = props->prefix_extractor_name; + + if (!full_scan_mode && + !prefix_extractor_in_file.empty() /* old version sst file*/ + && prefix_extractor_in_file != "nullptr") { + if (!prefix_extractor) { + return Status::InvalidArgument( + "Prefix extractor is missing when opening a PlainTable built " + "using a prefix extractor"); + } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != + 0) { + return Status::InvalidArgument( + "Prefix extractor given doesn't match the one used to build " + "PlainTable"); + } + } + + EncodingType encoding_type = kPlain; + auto encoding_type_prop = + user_props.find(PlainTablePropertyNames::kEncodingType); + if (encoding_type_prop != user_props.end()) { + encoding_type = static_cast<EncodingType>( + DecodeFixed32(encoding_type_prop->second.c_str())); + } + + std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props.get(), prefix_extractor)); + + s = new_reader->MmapDataIfNeeded(); + if (!s.ok()) { + return s; + } + + if (!full_scan_mode) { + s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key, + hash_table_ratio, index_sparseness, + huge_page_tlb_size); + if (!s.ok()) { + return s; + } + } else { + // Flag to indicate it is a full scan mode so that none of the indexes + // can be used. + new_reader->full_scan_mode_ = true; + } + // PopulateIndex can add to the props, so don't store them until now + new_reader->table_properties_ = props; + + if (immortal_table && new_reader->file_info_.is_mmap_mode) { + new_reader->dummy_cleanable_.reset(new Cleanable()); + } + + *table_reader = std::move(new_reader); + return s; +} + +void PlainTableReader::SetupForCompaction() { +} + +InternalIterator* PlainTableReader::NewIterator( + const ReadOptions& options, const SliceTransform* /* prefix_extractor */, + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { + // Not necessarily used here, but make sure this has been initialized + assert(table_properties_); + + // Auto prefix mode is not implemented in PlainTable. + bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && + !options.auto_prefix_mode; + if (arena == nullptr) { + return new PlainTableIterator(this, use_prefix_seek); + } else { + auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); + return new (mem) PlainTableIterator(this, use_prefix_seek); + } +} + +Status PlainTableReader::PopulateIndexRecordList( + PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes) { + Slice prev_key_prefix_slice; + std::string prev_key_prefix_buf; + uint32_t pos = data_start_offset_; + + bool is_first_record = true; + Slice key_prefix_slice; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + while (pos < file_info_.data_end_offset) { + uint32_t key_offset = pos; + ParsedInternalKey key; + Slice value_slice; + bool seekable = false; + Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); + if (!s.ok()) { + return s; + } + + key_prefix_slice = GetPrefix(key); + if (enable_bloom_) { + bloom_.AddHash(GetSliceHash(key.user_key)); + } else { + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + if (!is_first_record) { + prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); + } + if (file_info_.is_mmap_mode) { + prev_key_prefix_slice = key_prefix_slice; + } else { + prev_key_prefix_buf = key_prefix_slice.ToString(); + prev_key_prefix_slice = prev_key_prefix_buf; + } + } + } + + index_builder->AddKeyPrefix(GetPrefix(key), key_offset); + + if (!seekable && is_first_record) { + return Status::Corruption("Key for a prefix is not seekable"); + } + + is_first_record = false; + } + + prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); + auto s = index_.InitFromRawData(index_builder->Finish()); + return s; +} + +void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, + size_t huge_page_tlb_size) { + uint32_t bloom_total_bits = num_keys * bloom_bits_per_key; + if (bloom_total_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); + } +} + +void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) { + assert(bloom_.IsInitialized()); + for (const auto prefix_hash : prefix_hashes) { + bloom_.AddHash(prefix_hash); + } +} + +Status PlainTableReader::MmapDataIfNeeded() { + if (file_info_.is_mmap_mode) { + // Get mmapped memory. + return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr); + } + return Status::OK(); +} + +Status PlainTableReader::PopulateIndex(TableProperties* props, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness, + size_t huge_page_tlb_size) { + assert(props != nullptr); + + BlockContents index_block_contents; + Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + PlainTableIndexBuilder::kPlainTableIndexBlock, + BlockType::kIndex, &index_block_contents, + true /* compression_type_missing */); + + bool index_in_file = s.ok(); + + BlockContents bloom_block_contents; + bool bloom_in_file = false; + // We only need to read the bloom block if index block is in file. + if (index_in_file) { + s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents, + true /* compression_type_missing */); + bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; + } + + Slice* bloom_block; + if (bloom_in_file) { + // If bloom_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the bloom block. + // It needs to be kept alive to keep `bloom_block` valid. + bloom_block_alloc_ = std::move(bloom_block_contents.allocation); + bloom_block = &bloom_block_contents.data; + } else { + bloom_block = nullptr; + } + + Slice* index_block; + if (index_in_file) { + // If index_block_contents.allocation is not empty (which will be the case + // for non-mmap mode), it holds the alloated memory for the index block. + // It needs to be kept alive to keep `index_block` valid. + index_block_alloc_ = std::move(index_block_contents.allocation); + index_block = &index_block_contents.data; + } else { + index_block = nullptr; + } + + if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { + // moptions.prefix_extractor is requried for a hash-based look-up. + return Status::NotSupported( + "PlainTable requires a prefix extractor enable prefix hash mode."); + } + + // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows + // for a prefix (starting from the first one), generate a record of (hash, + // offset) and append it to IndexRecordList, which is a data structure created + // to store them. + + if (!index_in_file) { + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + AllocateBloom(bloom_bits_per_key, + static_cast<uint32_t>(props->num_entries), + huge_page_tlb_size); + } + } else if (bloom_in_file) { + enable_bloom_ = true; + auto num_blocks_property = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + + uint32_t num_blocks = 0; + if (num_blocks_property != props->user_collected_properties.end()) { + Slice temp_slice(num_blocks_property->second); + if (!GetVarint32(&temp_slice, &num_blocks)) { + num_blocks = 0; + } + } + // cast away const qualifier, because bloom_ won't be changed + bloom_.SetRawData(const_cast<char*>(bloom_block->data()), + static_cast<uint32_t>(bloom_block->size()) * 8, + num_blocks); + } else { + // Index in file but no bloom in file. Disable bloom filter in this case. + enable_bloom_ = false; + bloom_bits_per_key = 0; + } + + PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, + index_sparseness, hash_table_ratio, + huge_page_tlb_size); + + std::vector<uint32_t> prefix_hashes; + if (!index_in_file) { + // Populates _bloom if enabled (total order mode) + s = PopulateIndexRecordList(&index_builder, &prefix_hashes); + if (!s.ok()) { + return s; + } + } else { + s = index_.InitFromRawData(*index_block); + if (!s.ok()) { + return s; + } + } + + if (!index_in_file) { + if (!IsTotalOrderMode()) { + // Calculated bloom filter size and allocate memory for + // bloom filter based on the number of prefixes, then fill it. + AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(), + huge_page_tlb_size); + if (enable_bloom_) { + FillBloom(prefix_hashes); + } + } + } + + // Fill two table properties. + if (!index_in_file) { + props->user_collected_properties["plain_table_hash_table_size"] = + ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); + props->user_collected_properties["plain_table_sub_index_size"] = + ToString(index_.GetSubIndexSize()); + } else { + props->user_collected_properties["plain_table_hash_table_size"] = + ToString(0); + props->user_collected_properties["plain_table_sub_index_size"] = + ToString(0); + } + + return Status::OK(); +} + +Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, + const Slice& target, const Slice& prefix, + uint32_t prefix_hash, bool& prefix_matched, + uint32_t* offset) const { + prefix_matched = false; + uint32_t prefix_index_offset; + auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); + if (res == PlainTableIndex::kNoPrefixForBucket) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } else if (res == PlainTableIndex::kDirectToFile) { + *offset = prefix_index_offset; + return Status::OK(); + } + + // point to sub-index, need to do a binary search + uint32_t upper_bound; + const char* base_ptr = + index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); + uint32_t low = 0; + uint32_t high = upper_bound; + ParsedInternalKey mid_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + + // The key is between [low, high). Do a binary search between it. + while (high - low > 1) { + uint32_t mid = (high + low) / 2; + uint32_t file_offset = GetFixed32Element(base_ptr, mid); + uint32_t tmp; + Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + int cmp_result = internal_comparator_.Compare(mid_key, parsed_target); + if (cmp_result < 0) { + low = mid; + } else { + if (cmp_result == 0) { + // Happen to have found the exact key or target is smaller than the + // first key after base_offset. + prefix_matched = true; + *offset = file_offset; + return Status::OK(); + } else { + high = mid; + } + } + } + // Both of the key at the position low or low+1 could share the same + // prefix as target. We need to rule out one of them to avoid to go + // to the wrong prefix. + ParsedInternalKey low_key; + uint32_t tmp; + uint32_t low_key_offset = GetFixed32Element(base_ptr, low); + Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + if (!s.ok()) { + return s; + } + + if (GetPrefix(low_key) == prefix) { + prefix_matched = true; + *offset = low_key_offset; + } else if (low + 1 < upper_bound) { + // There is possible a next prefix, return it + prefix_matched = false; + *offset = GetFixed32Element(base_ptr, low + 1); + } else { + // target is larger than a key of the last prefix in this bucket + // but with a different prefix. Key does not exist. + *offset = file_info_.data_end_offset; + } + return Status::OK(); +} + +bool PlainTableReader::MatchBloom(uint32_t hash) const { + if (!enable_bloom_) { + return true; + } + + if (bloom_.MayContainHash(hash)) { + PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + return true; + } else { + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + return false; + } +} + +Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, + Slice* internal_key, Slice* value, + bool* seekable) const { + if (*offset == file_info_.data_end_offset) { + *offset = file_info_.data_end_offset; + return Status::OK(); + } + + if (*offset > file_info_.data_end_offset) { + return Status::Corruption("Offset is out of file size"); + } + + uint32_t bytes_read; + Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, + &bytes_read, seekable); + if (!s.ok()) { + return s; + } + *offset = *offset + bytes_read; + return Status::OK(); +} + +void PlainTableReader::Prepare(const Slice& target) { + if (enable_bloom_) { + uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); + bloom_.Prefetch(prefix_hash); + } +} + +Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, + GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { + // Check bloom filter first. + Slice prefix_slice; + uint32_t prefix_hash; + if (IsTotalOrderMode()) { + if (full_scan_mode_) { + status_ = + Status::InvalidArgument("Get() is not allowed in full scan mode."); + } + // Match whole user key for bloom filter check. + if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { + return Status::OK(); + } + // in total order mode, there is only one bucket 0, and we always use empty + // prefix. + prefix_slice = Slice(); + prefix_hash = 0; + } else { + prefix_slice = GetPrefix(target); + prefix_hash = GetSliceHash(prefix_slice); + if (!MatchBloom(prefix_hash)) { + return Status::OK(); + } + } + uint32_t offset; + bool prefix_match; + PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, + prefix_extractor_); + Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, + prefix_match, &offset); + + if (!s.ok()) { + return s; + } + ParsedInternalKey found_key; + ParsedInternalKey parsed_target; + if (!ParseInternalKey(target, &parsed_target)) { + return Status::Corruption(Slice()); + } + Slice found_value; + while (offset < file_info_.data_end_offset) { + s = Next(&decoder, &offset, &found_key, nullptr, &found_value); + if (!s.ok()) { + return s; + } + if (!prefix_match) { + // Need to verify prefix for the first key found if it is not yet + // checked. + if (GetPrefix(found_key) != prefix_slice) { + return Status::OK(); + } + prefix_match = true; + } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? + if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(found_key, found_value, &dont_care, + dummy_cleanable_.get())) { + break; + } + } + } + return Status::OK(); +} + +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) { + return 0; +} + +uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, + const Slice& /*end*/, + TableReaderCaller /*caller*/) { + return 0; +} + +PlainTableIterator::PlainTableIterator(PlainTableReader* table, + bool use_prefix_seek) + : table_(table), + decoder_(&table_->file_info_, table_->encoding_type_, + table_->user_key_len_, table_->prefix_extractor_), + use_prefix_seek_(use_prefix_seek) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +PlainTableIterator::~PlainTableIterator() { +} + +bool PlainTableIterator::Valid() const { + return offset_ < table_->file_info_.data_end_offset && + offset_ >= table_->data_start_offset_; +} + +void PlainTableIterator::SeekToFirst() { + status_ = Status::OK(); + next_offset_ = table_->data_start_offset_; + if (next_offset_ >= table_->file_info_.data_end_offset) { + next_offset_ = offset_ = table_->file_info_.data_end_offset; + } else { + Next(); + } +} + +void PlainTableIterator::SeekToLast() { + assert(false); + status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); + next_offset_ = offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Seek(const Slice& target) { + if (use_prefix_seek_ != !table_->IsTotalOrderMode()) { + // This check is done here instead of NewIterator() to permit creating an + // iterator with total_order_seek = true even if we won't be able to Seek() + // it. This is needed for compaction: it creates iterator with + // total_order_seek = true but usually never does Seek() on it, + // only SeekToFirst(). + status_ = + Status::InvalidArgument( + "total_order_seek not implemented for PlainTable."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + // If the user doesn't set prefix seek option and we are not able to do a + // total Seek(). assert failure. + if (table_->IsTotalOrderMode()) { + if (table_->full_scan_mode_) { + status_ = + Status::InvalidArgument("Seek() is not allowed in full scan mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } else if (table_->GetIndexSize() > 1) { + assert(false); + status_ = Status::NotSupported( + "PlainTable cannot issue non-prefix seek unless in total order " + "mode."); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + + Slice prefix_slice = table_->GetPrefix(target); + uint32_t prefix_hash = 0; + // Bloom filter is ignored in total-order mode. + if (!table_->IsTotalOrderMode()) { + prefix_hash = GetSliceHash(prefix_slice); + if (!table_->MatchBloom(prefix_hash)) { + status_ = Status::OK(); + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + } + bool prefix_match; + status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, + prefix_match, &next_offset_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + return; + } + + if (next_offset_ < table_->file_info_.data_end_offset) { + for (Next(); status_.ok() && Valid(); Next()) { + if (!prefix_match) { + // Need to verify the first key's prefix + if (table_->GetPrefix(key()) != prefix_slice) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + break; + } + prefix_match = true; + } + if (table_->internal_comparator_.Compare(key(), target) >= 0) { + break; + } + } + } else { + offset_ = table_->file_info_.data_end_offset; + } +} + +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { + assert(false); + status_ = + Status::NotSupported("SeekForPrev() is not supported in PlainTable"); + offset_ = next_offset_ = table_->file_info_.data_end_offset; +} + +void PlainTableIterator::Next() { + offset_ = next_offset_; + if (offset_ < table_->file_info_.data_end_offset) { + Slice tmp_slice; + ParsedInternalKey parsed_key; + status_ = + table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); + if (!status_.ok()) { + offset_ = next_offset_ = table_->file_info_.data_end_offset; + } + } +} + +void PlainTableIterator::Prev() { + assert(false); +} + +Slice PlainTableIterator::key() const { + assert(Valid()); + return key_; +} + +Slice PlainTableIterator::value() const { + assert(Valid()); + return value_; +} + +Status PlainTableIterator::status() const { + return status_; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h new file mode 100644 index 000000000..db7b0626f --- /dev/null +++ b/src/rocksdb/table/plain/plain_table_reader.h @@ -0,0 +1,246 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE +#include <unordered_map> +#include <memory> +#include <vector> +#include <string> +#include <stdint.h> + +#include "db/dbformat.h" +#include "file/random_access_file_reader.h" +#include "memory/arena.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" +#include "table/table_reader.h" + +namespace ROCKSDB_NAMESPACE { + +class Block; +struct BlockContents; +class BlockHandle; +class Footer; +struct Options; +class RandomAccessFile; +struct ReadOptions; +class TableCache; +class TableReader; +class InternalKeyComparator; +class PlainTableKeyDecoder; +class GetContext; + +extern const uint32_t kPlainTableVariableLength; + +struct PlainTableReaderFileInfo { + bool is_mmap_mode; + Slice file_data; + uint32_t data_end_offset; + std::unique_ptr<RandomAccessFileReader> file; + + PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file, + const EnvOptions& storage_options, + uint32_t _data_size_offset) + : is_mmap_mode(storage_options.use_mmap_reads), + data_end_offset(_data_size_offset), + file(std::move(_file)) {} +}; + +// The reader class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader: public TableReader { + public: +// Based on following output file format shown in plain_table_factory.h +// When opening the output file, PlainTableReader creates a hash table +// from key prefixes to offset of the output file. PlainTable will decide +// whether it points to the data offset of the first key with the key prefix +// or the offset of it. If there are too many keys share this prefix, it will +// create a binary search-able index from the suffix to offset on disk. + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, + const int bloom_bits_per_key, double hash_table_ratio, + size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table = false, + const SliceTransform* prefix_extractor = nullptr); + + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; + + void Prepare(const Slice& target) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + + uint32_t GetIndexSize() const { return index_.GetIndexSize(); } + void SetupForCompaction() override; + + std::shared_ptr<const TableProperties> GetTableProperties() const override { + return table_properties_; + } + + virtual size_t ApproximateMemoryUsage() const override { + return arena_.MemoryAllocatedBytes(); + } + + PlainTableReader(const ImmutableCFOptions& ioptions, + std::unique_ptr<RandomAccessFileReader>&& file, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor); + virtual ~PlainTableReader(); + + protected: + // Check bloom filter to see whether it might contain this prefix. + // The hash of the prefix is given, since it can be reused for index lookup + // too. + virtual bool MatchBloom(uint32_t hash) const; + + // PopulateIndex() builds index of keys. It must be called before any query + // to the table. + // + // props: the table properties object that need to be stored. Ownership of + // the object will be passed. + // + + Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + size_t huge_page_tlb_size); + + Status MmapDataIfNeeded(); + + private: + const InternalKeyComparator internal_comparator_; + EncodingType encoding_type_; + // represents plain table's current status. + Status status_; + + PlainTableIndex index_; + bool full_scan_mode_; + + // data_start_offset_ and data_end_offset_ defines the range of the + // sst file that stores data. + const uint32_t data_start_offset_ = 0; + const uint32_t user_key_len_; + const SliceTransform* prefix_extractor_; + + static const size_t kNumInternalBytes = 8; + + // Bloom filter is used to rule out non-existent key + bool enable_bloom_; + PlainTableBloomV1 bloom_; + PlainTableReaderFileInfo file_info_; + Arena arena_; + CacheAllocationPtr index_block_alloc_; + CacheAllocationPtr bloom_block_alloc_; + + const ImmutableCFOptions& ioptions_; + std::unique_ptr<Cleanable> dummy_cleanable_; + uint64_t file_size_; + protected: // for testing + std::shared_ptr<const TableProperties> table_properties_; + private: + + bool IsFixedLength() const { + return user_key_len_ != kPlainTableVariableLength; + } + + size_t GetFixedInternalKeyLength() const { + return user_key_len_ + kNumInternalBytes; + } + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. + return Slice(); + } + } + + friend class TableCache; + friend class PlainTableIterator; + + // Internal helper function to generate an IndexRecordList object from all + // the rows, which contains index records as a list. + // If bloom_ is not null, all the keys' full-key hash will be added to the + // bloom filter. + Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, + std::vector<uint32_t>* prefix_hashes); + + // Internal helper function to allocate memory for bloom filter + void AllocateBloom(int bloom_bits_per_key, int num_prefixes, + size_t huge_page_tlb_size); + + void FillBloom(const std::vector<uint32_t>& prefix_hashes); + + // Read the key and value at `offset` to parameters for keys, the and + // `seekable`. + // On success, `offset` will be updated as the offset for the next key. + // `parsed_key` will be key in parsed format. + // if `internal_key` is not empty, it will be filled with key with slice + // format. + // if `seekable` is not null, it will return whether we can directly read + // data using this offset. + Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset, + ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, + bool* seekable = nullptr) const; + // Get file offset for key target. + // return value prefix_matched is set to true if the offset is confirmed + // for a key with the same prefix as target. + Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, + const Slice& prefix, uint32_t prefix_hash, + bool& prefix_matched, uint32_t* offset) const; + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } + + // No copying allowed + explicit PlainTableReader(const TableReader&) = delete; + void operator=(const TableReader&) = delete; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE |