diff options
Diffstat (limited to 'src/rocksdb/table/block_based/block_based_table_factory.cc')
-rw-r--r-- | src/rocksdb/table/block_based/block_based_table_factory.cc | 649 |
1 files changed, 649 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc new file mode 100644 index 000000000..70a6f38d5 --- /dev/null +++ b/src/rocksdb/table/block_based/block_based_table_factory.cc @@ -0,0 +1,649 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <stdint.h> +#include <cinttypes> + +#include <memory> +#include <string> + +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/flush_block_policy.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "util/mutexlock.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void TailPrefetchStats::RecordEffectiveSize(size_t len) { + MutexLock l(&mutex_); + if (num_records_ < kNumTracked) { + num_records_++; + } + records_[next_++] = len; + if (next_ == kNumTracked) { + next_ = 0; + } +} + +size_t TailPrefetchStats::GetSuggestedPrefetchSize() { + std::vector<size_t> sorted; + { + MutexLock l(&mutex_); + + if (num_records_ == 0) { + return 0; + } + sorted.assign(records_, records_ + num_records_); + } + + // Of the historic size, we find the maximum one that satisifis the condtiion + // that if prefetching all, less than 1/8 will be wasted. + std::sort(sorted.begin(), sorted.end()); + + // Assuming we have 5 data points, and after sorting it looks like this: + // + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // +---+ | | | | + // | | | | | | + // +---+ | | | | | | + // | | | | | | | | + // +---+ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // and we use every of the value as a candidate, and estimate how much we + // wasted, compared to read. For example, when we use the 3rd record + // as candiate. This area is what we read: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ***+ *** *** *** *** ** + // * | | | | | | + // +---+ | | | | | * + // * | | | | | | | | + // +---+ | | | | | | | * + // * | | | | X | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // | | | | | | | | | * + // * | | | | | | | | | + // *** *** ***-*** ***--*** ***--*** +**** + // which is (size of the record) X (number of records). + // + // While wasted is this area: + // +---+ + // +---+ | | + // | | | | + // | | | | + // | | | | + // | | | | + // *** *** *** ****---+ | | | | + // * * | | | | | + // * *-*** *** | | | | | + // * * | | | | | | | + // *--** *** | | | | | | | + // | | | | | X | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // Which can be calculated iteratively. + // The difference between wasted using 4st and 3rd record, will + // be following area: + // +---+ + // +--+ +-+ ++ +-+ +-+ +---+ | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | + // +-+ +-+ +-+ ++ +---+ +--+ | | | + // | | | | | | | + // +---+ ++ | | | | | | + // | | | | | | X | | | + // +---+ ++ | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // | | | | | | | | | | + // +---+ +---+ +---+ +---+ +---+ + // + // which will be the size difference between 4st and 3rd record, + // times 3, which is number of records before the 4st. + // Here we assume that all data within the prefetch range will be useful. In + // reality, it may not be the case when a partial block is inside the range, + // or there are data in the middle that is not read. We ignore those cases + // for simplicity. + assert(!sorted.empty()); + size_t prev_size = sorted[0]; + size_t max_qualified_size = sorted[0]; + size_t wasted = 0; + for (size_t i = 1; i < sorted.size(); i++) { + size_t read = sorted[i] * sorted.size(); + wasted += (sorted[i] - prev_size) * i; + if (wasted <= read / 8) { + max_qualified_size = sorted[i]; + } + prev_size = sorted[i]; + } + const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB + return std::min(kMaxPrefetchSize, max_qualified_size); +} + +// TODO(myabandeh): We should return an error instead of silently changing the +// options +BlockBasedTableFactory::BlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) + : table_options_(_table_options) { + if (table_options_.flush_block_policy_factory == nullptr) { + table_options_.flush_block_policy_factory.reset( + new FlushBlockBySizePolicyFactory()); + } + if (table_options_.no_block_cache) { + table_options_.block_cache.reset(); + } else if (table_options_.block_cache == nullptr) { + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); + } + if (table_options_.block_size_deviation < 0 || + table_options_.block_size_deviation > 100) { + table_options_.block_size_deviation = 0; + } + if (table_options_.block_restart_interval < 1) { + table_options_.block_restart_interval = 1; + } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + table_options_.index_block_restart_interval != 1) { + // Currently kHashSearch is incompatible with index_block_restart_interval > 1 + table_options_.index_block_restart_interval = 1; + } + if (table_options_.partition_filters && + table_options_.index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // We do not support partitioned filters without partitioning indexes + table_options_.partition_filters = false; + } +} + +Status BlockBasedTableFactory::NewTableReader( + const TableReaderOptions& table_reader_options, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, + bool prefetch_index_and_filter_in_cache) const { + return BlockBasedTable::Open( + table_reader_options.ioptions, table_reader_options.env_options, + table_options_, table_reader_options.internal_comparator, std::move(file), + file_size, table_reader, table_reader_options.prefix_extractor, + prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, + table_reader_options.level, table_reader_options.immortal, + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); +} + +TableBuilder* BlockBasedTableFactory::NewTableBuilder( + const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + WritableFileWriter* file) const { + auto table_builder = new BlockBasedTableBuilder( + table_builder_options.ioptions, table_builder_options.moptions, + table_options_, table_builder_options.internal_comparator, + table_builder_options.int_tbl_prop_collector_factories, column_family_id, + file, table_builder_options.compression_type, + table_builder_options.sample_for_compression, + table_builder_options.compression_opts, + table_builder_options.skip_filters, + table_builder_options.column_family_name, table_builder_options.level, + table_builder_options.creation_time, + table_builder_options.oldest_key_time, + table_builder_options.target_file_size, + table_builder_options.file_creation_time); + + return table_builder; +} + +Status BlockBasedTableFactory::SanitizeOptions( + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument( + "Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } + if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { + return Status::InvalidArgument( + "Unsupported BlockBasedTable format_version. Please check " + "include/rocksdb/table.h for more info"); + } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument( + "Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } + if (table_options_.block_size > port::kMaxUint32) { + return Status::InvalidArgument( + "block size exceeds maximum number (4GiB) allowed"); + } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } + return Status::OK(); +} + +std::string BlockBasedTableFactory::GetPrintableTableOptions() const { + std::string ret; + ret.reserve(20000); + const int kBufferSize = 200; + char buffer[kBufferSize]; + + snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", + table_options_.flush_block_policy_factory->Name(), + static_cast<void*>(table_options_.flush_block_policy_factory.get())); + ret.append(buffer); + snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", + table_options_.cache_index_and_filter_blocks); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " cache_index_and_filter_blocks_with_high_priority: %d\n", + table_options_.cache_index_and_filter_blocks_with_high_priority); + ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", + table_options_.pin_top_level_index_and_filter); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_type: %d\n", + table_options_.index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_shortening: %d\n", + static_cast<int>(table_options_.index_shortening)); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); + snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", + table_options_.hash_index_allow_collision); + ret.append(buffer); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); + ret.append(buffer); + snprintf(buffer, kBufferSize, " no_block_cache: %d\n", + table_options_.no_block_cache); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_cache: %p\n", + static_cast<void*>(table_options_.block_cache.get())); + ret.append(buffer); + if (table_options_.block_cache) { + const char* block_cache_name = table_options_.block_cache->Name(); + if (block_cache_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_name); + ret.append(buffer); + } + ret.append(" block_cache_options:\n"); + ret.append(table_options_.block_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", + static_cast<void*>(table_options_.block_cache_compressed.get())); + ret.append(buffer); + if (table_options_.block_cache_compressed) { + const char* block_cache_compressed_name = + table_options_.block_cache_compressed->Name(); + if (block_cache_compressed_name != nullptr) { + snprintf(buffer, kBufferSize, " block_cache_name: %s\n", + block_cache_compressed_name); + ret.append(buffer); + } + ret.append(" block_cache_compressed_options:\n"); + ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " persistent_cache: %p\n", + static_cast<void*>(table_options_.persistent_cache.get())); + ret.append(buffer); + if (table_options_.persistent_cache) { + snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); + ret.append(buffer); + ret.append(table_options_.persistent_cache->GetPrintableOptions()); + } + snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + table_options_.block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", + table_options_.block_size_deviation); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", + table_options_.block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); + snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); + snprintf(buffer, kBufferSize, " filter_policy: %s\n", + table_options_.filter_policy == nullptr + ? "nullptr" + : table_options_.filter_policy->Name()); + ret.append(buffer); + snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", + table_options_.whole_key_filtering); + ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); + ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); + return ret; +} + +#ifndef ROCKSDB_LITE +namespace { +bool SerializeSingleBlockBasedTableOption( + std::string* opt_string, const BlockBasedTableOptions& bbt_options, + const std::string& name, const std::string& delimiter) { + auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + return false; + } + auto& opt_info = iter->second; + const char* opt_address = + reinterpret_cast<const char*>(&bbt_options) + opt_info.offset; + std::string value; + bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); + if (result) { + *opt_string = name + "=" + value + delimiter; + } + return result; +} +} // namespace + +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + assert(opt_string); + opt_string->clear(); + for (auto iter = block_based_table_type_info.begin(); + iter != block_based_table_type_info.end(); ++iter) { + if (iter->second.verification == OptionVerificationType::kDeprecated) { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + continue; + } + std::string single_output; + bool result = SerializeSingleBlockBasedTableOption( + &single_output, table_options_, iter->first, delimiter); + assert(result); + if (result) { + opt_string->append(single_output); + } + } + return Status::OK(); +} +#else +Status BlockBasedTableFactory::GetOptionString( + std::string* /*opt_string*/, const std::string& /*delimiter*/) const { + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { + return table_options_; +} + +#ifndef ROCKSDB_LITE +namespace { +std::string ParseBlockBasedTableOption(const std::string& name, + const std::string& org_value, + BlockBasedTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + if (!input_strings_escaped) { + // if the input string is not escaped, it means this function is + // invoked from SetOptions, which takes the old format. + if (name == "block_cache" || name == "block_cache_compressed") { + // cache options can be specified in the following format + // "block_cache={capacity=1M;num_shard_bits=4; + // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" + // To support backward compatibility, the following format + // is also supported. + // "block_cache=1M" + std::shared_ptr<Cache> cache; + // block_cache is specified in format block_cache=<cache_size>. + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + if (!ParseOptionHelper(reinterpret_cast<char*>(&cache_opts), + OptionType::kLRUCacheOptions, value)) { + return "Invalid cache options"; + } + cache = NewLRUCache(cache_opts); + } + + if (name == "block_cache") { + new_options->block_cache = cache; + } else { + new_options->block_cache_compressed = cache; + } + return ""; + } else if (name == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (value.compare(0, kName.size(), kName) != 0) { + return "Invalid filter policy name"; + } + size_t pos = value.find(':', kName.size()); + if (pos == std::string::npos) { + return "Invalid filter policy config, missing bits_per_key"; + } + double bits_per_key = + ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + new_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + return ""; + } + } + const auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast<char*>(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} +} // namespace + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map<std::string, std::string> opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map<std::string, std::string>& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParseBlockBasedTableOption( + o.first, o.second, new_table_options, input_strings_escaped, + ignore_unknown_options); + if (error_message != "") { + const auto iter = block_based_table_type_info.find(o.first); + if (iter == block_based_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level) { + if ((base_tf != nullptr) != (file_tf != nullptr) && + sanity_check_level > kSanityLevelNone) { + return Status::Corruption( + "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); + } + if (base_tf == nullptr) { + return Status::OK(); + } + assert(file_tf != nullptr); + + const auto& base_opt = base_tf->table_options(); + const auto& file_opt = file_tf->table_options(); + + for (auto& pair : block_based_table_type_info) { + if (pair.second.verification == OptionVerificationType::kDeprecated) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + continue; + } + if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { + if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt), + reinterpret_cast<const char*>(&file_opt), + pair.second, pair.first, nullptr)) { + return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on BlockBasedTableOptions::", + pair.first); + } + } + } + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +TableFactory* NewBlockBasedTableFactory( + const BlockBasedTableOptions& _table_options) { + return new BlockBasedTableFactory(_table_options); +} + +const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; +const std::string BlockBasedTablePropertyNames::kIndexType = + "rocksdb.block.based.table.index.type"; +const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = + "rocksdb.block.based.table.whole.key.filtering"; +const std::string BlockBasedTablePropertyNames::kPrefixFiltering = + "rocksdb.block.based.table.prefix.filtering"; +const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; +const std::string kHashIndexPrefixesMetadataBlock = + "rocksdb.hashindex.metadata"; +const std::string kPropTrue = "1"; +const std::string kPropFalse = "0"; + +} // namespace ROCKSDB_NAMESPACE |