summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/block_based/block_based_table_factory.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/rocksdb/table/block_based/block_based_table_factory.cc1058
1 files changed, 1058 insertions, 0 deletions
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc
new file mode 100644
index 000000000..09c1d2f62
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_factory.cc
@@ -0,0 +1,1058 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_table_factory.h"
+
+#include <stdint.h>
+
+#include <cinttypes>
+#include <memory>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "logging/logging.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Records `len` in the fixed-capacity history of effective sizes. Once
+// kNumTracked samples have been stored, the write cursor wraps around and new
+// samples overwrite the slots written earliest.
+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+  MutexLock l(&mutex_);
+  // Store into the current slot, then advance the cursor with wrap-around.
+  records_[next_] = len;
+  ++next_;
+  if (next_ == kNumTracked) {
+    next_ = 0;
+  }
+  // The sample count saturates at the capacity of the history buffer.
+  if (num_records_ < kNumTracked) {
+    ++num_records_;
+  }
+}
+
+// Returns a suggested prefetch size derived from the recorded history, or 0
+// when nothing has been recorded yet. The result never exceeds 512KB.
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+  // Snapshot the history under the lock; the analysis below works on the
+  // private copy only.
+  std::vector<size_t> sizes;
+  {
+    MutexLock l(&mutex_);
+
+    if (num_records_ == 0) {
+      return 0;
+    }
+    sizes.assign(records_, records_ + num_records_);
+  }
+
+  // Among the historic sizes, find the largest one for which, if that many
+  // bytes had been prefetched every time, at most 1/8 of the bytes read
+  // would have been wasted.
+  std::sort(sizes.begin(), sizes.end());
+
+  // With the samples sorted ascending, using sizes[i] as the prefetch size
+  // for all N samples reads sizes[i] * N bytes in total.
+  //
+  // Every sample smaller than the candidate wastes the difference between
+  // the candidate and itself, so the waste for candidate i is
+  //     sum over j < i of (sizes[i] - sizes[j]).
+  // Samples at or above the candidate contribute no waste.
+  //
+  // The waste is computed iteratively: stepping from candidate i-1 to
+  // candidate i raises the prefetch size by (sizes[i] - sizes[i-1]), and
+  // that increase is wasted once for each of the i samples below candidate
+  // i. For example, the extra waste of the 4th record over the 3rd is the
+  // size difference between them times 3, the number of records before the
+  // 4th.
+  //
+  // Here we assume that all data within the prefetch range will be useful.
+  // In reality, that may not hold when a partial block is inside the range,
+  // or when data in the middle is not read. Those cases are ignored for
+  // simplicity.
+  assert(!sizes.empty());
+  const size_t sample_count = sizes.size();
+  size_t best_size = sizes[0];
+  size_t wasted = 0;
+  for (size_t i = 1; i < sample_count; i++) {
+    const size_t candidate = sizes[i];
+    wasted += (candidate - sizes[i - 1]) * i;
+    const size_t read = candidate * sample_count;
+    if (wasted <= read / 8) {
+      best_size = candidate;
+    }
+  }
+  const size_t kMaxPrefetchSize = 512 * 1024;  // Never exceed 512KB
+  return std::min(kMaxPrefetchSize, best_size);
+}
+
+#ifndef ROCKSDB_LITE
+
+// Option name under which the MetadataCacheOptions struct is registered in
+// block_based_table_type_info.
+const std::string kOptNameMetadataCacheOpts = "metadata_cache_options";
+
+// Maps the textual names of PinningTier values to the enum values. Referenced
+// by every entry of metadata_cache_options_type_info.
+static std::unordered_map<std::string, PinningTier>
+ pinning_tier_type_string_map = {
+ {"kFallback", PinningTier::kFallback},
+ {"kNone", PinningTier::kNone},
+ {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
+ {"kAll", PinningTier::kAll}};
+
+// Maps the textual names of BlockBasedTableOptions::IndexType values to the
+// enum values. Used by the "index_type" entry of block_based_table_type_info.
+static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
+ block_base_table_index_type_string_map = {
+ {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
+ {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
+ {"kTwoLevelIndexSearch",
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
+ {"kBinarySearchWithFirstKey",
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
+
+// Maps the textual names of BlockBasedTableOptions::DataBlockIndexType values
+// to the enum values. Used by the "data_block_index_type" entry of
+// block_based_table_type_info.
+static std::unordered_map<std::string,
+ BlockBasedTableOptions::DataBlockIndexType>
+ block_base_table_data_block_index_type_string_map = {
+ {"kDataBlockBinarySearch",
+ BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
+ {"kDataBlockBinaryAndHash",
+ BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
+
+// Maps the textual names of BlockBasedTableOptions::IndexShorteningMode
+// values to the enum values. Used by the "index_shortening" entry of
+// block_based_table_type_info.
+static std::unordered_map<std::string,
+ BlockBasedTableOptions::IndexShorteningMode>
+ block_base_table_index_shortening_mode_string_map = {
+ {"kNoShortening",
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
+ {"kShortenSeparators",
+ BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
+ {"kShortenSeparatorsAndSuccessor",
+ BlockBasedTableOptions::IndexShorteningMode::
+ kShortenSeparatorsAndSuccessor}};
+
+// Describes the fields of MetadataCacheOptions for option-string handling.
+// Each field is a PinningTier enum located by its offset within the struct
+// and translated through pinning_tier_type_string_map.
+static std::unordered_map<std::string, OptionTypeInfo>
+ metadata_cache_options_type_info = {
+ {"top_level_index_pinning",
+ OptionTypeInfo::Enum<PinningTier>(
+ offsetof(struct MetadataCacheOptions, top_level_index_pinning),
+ &pinning_tier_type_string_map)},
+ {"partition_pinning",
+ OptionTypeInfo::Enum<PinningTier>(
+ offsetof(struct MetadataCacheOptions, partition_pinning),
+ &pinning_tier_type_string_map)},
+ {"unpartitioned_pinning",
+ OptionTypeInfo::Enum<PinningTier>(
+ offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
+ &pinning_tier_type_string_map)}};
+
+// Maps the textual names of BlockBasedTableOptions::PrepopulateBlockCache
+// values to the enum values. Used by the "prepopulate_block_cache" entry of
+// block_based_table_type_info.
+static std::unordered_map<std::string,
+ BlockBasedTableOptions::PrepopulateBlockCache>
+ block_base_table_prepopulate_block_cache_string_map = {
+ {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
+ {"kFlushOnly",
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
+
+#endif // ROCKSDB_LITE
+
+// Describes, per option name, how a field of BlockBasedTableOptions is handled
+// as an option string: the field's offset, its type, how it is verified and
+// its flags (e.g. kMutable). The table is empty when ROCKSDB_LITE is defined.
+// Entries marked OptionVerificationType::kDeprecated carry offset 0 and do
+// not refer to a field of the struct (presumably retained so that old option
+// strings are still accepted -- confirm against OptionTypeInfo).
+static std::unordered_map<std::string, OptionTypeInfo>
+ block_based_table_type_info = {
+#ifndef ROCKSDB_LITE
+ /* No entry in this table:
+ CacheUsageOptions cache_usage_options;
+ block_cache and block_cache_compressed do have entries below, but only
+ with a parse function: both are flagged kDontSerialize and kCompareNever.
+ */
+ {"flush_block_policy_factory",
+ OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
+ offsetof(struct BlockBasedTableOptions,
+ flush_block_policy_factory),
+ OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
+ {"cache_index_and_filter_blocks",
+ {offsetof(struct BlockBasedTableOptions,
+ cache_index_and_filter_blocks),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"cache_index_and_filter_blocks_with_high_priority",
+ {offsetof(struct BlockBasedTableOptions,
+ cache_index_and_filter_blocks_with_high_priority),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"pin_l0_filter_and_index_blocks_in_cache",
+ {offsetof(struct BlockBasedTableOptions,
+ pin_l0_filter_and_index_blocks_in_cache),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
+ offsetof(struct BlockBasedTableOptions, index_type),
+ &block_base_table_index_type_string_map)},
+ {"hash_index_allow_collision",
+ {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+ OptionTypeFlags::kNone}},
+ {"data_block_index_type",
+ OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
+ offsetof(struct BlockBasedTableOptions, data_block_index_type),
+ &block_base_table_data_block_index_type_string_map)},
+ {"index_shortening",
+ OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
+ offsetof(struct BlockBasedTableOptions, index_shortening),
+ &block_base_table_index_shortening_mode_string_map)},
+ {"data_block_hash_table_util_ratio",
+ {offsetof(struct BlockBasedTableOptions,
+ data_block_hash_table_util_ratio),
+ OptionType::kDouble, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"checksum",
+ {offsetof(struct BlockBasedTableOptions, checksum),
+ OptionType::kChecksumType, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"no_block_cache",
+ {offsetof(struct BlockBasedTableOptions, no_block_cache),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"block_size",
+ {offsetof(struct BlockBasedTableOptions, block_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"block_size_deviation",
+ {offsetof(struct BlockBasedTableOptions, block_size_deviation),
+ OptionType::kInt, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"block_restart_interval",
+ {offsetof(struct BlockBasedTableOptions, block_restart_interval),
+ OptionType::kInt, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"index_block_restart_interval",
+ {offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
+ OptionType::kInt, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"index_per_partition",
+ {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
+ OptionTypeFlags::kNone}},
+ {"metadata_block_size",
+ {offsetof(struct BlockBasedTableOptions, metadata_block_size),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"partition_filters",
+ {offsetof(struct BlockBasedTableOptions, partition_filters),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"optimize_filters_for_memory",
+ {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"filter_policy",
+ OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
+ offsetof(struct BlockBasedTableOptions, filter_policy),
+ OptionVerificationType::kByNameAllowFromNull,
+ OptionTypeFlags::kNone)},
+ {"whole_key_filtering",
+ {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"detect_filter_construct_corruption",
+ {offsetof(struct BlockBasedTableOptions,
+ detect_filter_construct_corruption),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"reserve_table_builder_memory",
+ {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+ OptionTypeFlags::kNone}},
+ {"reserve_table_reader_memory",
+ {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+ OptionTypeFlags::kNone}},
+ {"skip_table_builder_flush",
+ {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+ OptionTypeFlags::kNone}},
+ {"format_version",
+ {offsetof(struct BlockBasedTableOptions, format_version),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"verify_compression",
+ {offsetof(struct BlockBasedTableOptions, verify_compression),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"read_amp_bytes_per_bit",
+ {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
+ OptionType::kUInt32T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
+ // and 6.14. The bug will write out 8 bytes to OPTIONS file from the
+ // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
+ // which is actually a uint32. Consequently, the value of
+ // read_amp_bytes_per_bit written in the OPTIONS file is wrong.
+ // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
+ // from OPTIONS file as a uint32. To be able to load OPTIONS file
+ // generated by affected releases before the fix, we need to
+ // manually parse read_amp_bytes_per_bit with this special hack.
+ // The value is parsed as 64 bits and truncated to its low 32 bits.
+ uint64_t read_amp_bytes_per_bit = ParseUint64(value);
+ *(static_cast<uint32_t*>(addr)) =
+ static_cast<uint32_t>(read_amp_bytes_per_bit);
+ return Status::OK();
+ }}},
+ {"enable_index_compression",
+ {offsetof(struct BlockBasedTableOptions, enable_index_compression),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"block_align",
+ {offsetof(struct BlockBasedTableOptions, block_align),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"pin_top_level_index_and_filter",
+ {offsetof(struct BlockBasedTableOptions,
+ pin_top_level_index_and_filter),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {kOptNameMetadataCacheOpts,
+ OptionTypeInfo::Struct(
+ kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
+ offsetof(struct BlockBasedTableOptions, metadata_cache_options),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+ {"block_cache",
+ {offsetof(struct BlockBasedTableOptions, block_cache),
+ OptionType::kUnknown, OptionVerificationType::kNormal,
+ (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
+ // Parses the input value as a Cache
+ [](const ConfigOptions& opts, const std::string&,
+ const std::string& value, void* addr) {
+ auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
+ return Cache::CreateFromString(opts, value, cache);
+ }}},
+ {"block_cache_compressed",
+ {offsetof(struct BlockBasedTableOptions, block_cache_compressed),
+ OptionType::kUnknown, OptionVerificationType::kNormal,
+ (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
+ // Parses the input value as a Cache
+ [](const ConfigOptions& opts, const std::string&,
+ const std::string& value, void* addr) {
+ auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
+ return Cache::CreateFromString(opts, value, cache);
+ }}},
+ {"max_auto_readahead_size",
+ {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"prepopulate_block_cache",
+ OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
+ offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
+ &block_base_table_prepopulate_block_cache_string_map,
+ OptionTypeFlags::kMutable)},
+ {"initial_auto_readahead_size",
+ {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+ {"num_file_reads_for_auto_readahead",
+ {offsetof(struct BlockBasedTableOptions,
+ num_file_reads_for_auto_readahead),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kMutable}},
+
+#endif // ROCKSDB_LITE
+};
+
+// TODO(myabandeh): We should return an error instead of silently changing the
+// options
+//
+// Copies the given options, sanitizes the copy via InitializeOptions() and
+// registers it together with block_based_table_type_info. When a block cache
+// is present and charging is enabled for the kBlockBasedTableReader role, a
+// cache reservation manager backed by that block cache is created as well;
+// NewTableReader() passes it on to every table it opens.
+BlockBasedTableFactory::BlockBasedTableFactory(
+    const BlockBasedTableOptions& _table_options)
+    : table_options_(_table_options) {
+  InitializeOptions();
+  RegisterOptions(&table_options_, &block_based_table_type_info);
+
+  // InitializeOptions() has populated an override for every role, so the
+  // lookup below is expected to succeed.
+  const auto& role_overrides =
+      table_options_.cache_usage_options.options_overrides;
+  const bool charge_table_readers =
+      role_overrides.at(CacheEntryRole::kBlockBasedTableReader).charged ==
+      CacheEntryRoleOptions::Decision::kEnabled;
+  if (!table_options_.block_cache || !charge_table_readers) {
+    return;
+  }
+
+  auto reservation_mgr = std::make_shared<
+      CacheReservationManagerImpl<CacheEntryRole::kBlockBasedTableReader>>(
+      table_options_.block_cache);
+  table_reader_cache_res_mgr_.reset(
+      new ConcurrentCacheReservationManager(std::move(reservation_mgr)));
+}
+
+// Sanitizes table_options_ in place: fills in missing defaults and silently
+// corrects values that are out of range or incompatible with each other.
+void BlockBasedTableFactory::InitializeOptions() {
+  auto& opts = table_options_;
+
+  if (!opts.flush_block_policy_factory) {
+    opts.flush_block_policy_factory.reset(new FlushBlockBySizePolicyFactory());
+  }
+
+  if (opts.no_block_cache) {
+    opts.block_cache.reset();
+  } else if (!opts.block_cache) {
+    // No cache supplied: fall back to a default LRU cache of 8MB. With a
+    // cache this small it makes little sense to pay the overhead of
+    // mid-point insertion, so both priority pools are disabled.
+    LRUCacheOptions cache_opts;
+    cache_opts.capacity = 8 << 20;
+    cache_opts.high_pri_pool_ratio = 0.0;
+    cache_opts.low_pri_pool_ratio = 0.0;
+    opts.block_cache = NewLRUCache(cache_opts);
+  }
+
+  // block_size_deviation is only kept when it lies within [0, 100].
+  const bool deviation_in_range =
+      opts.block_size_deviation >= 0 && opts.block_size_deviation <= 100;
+  if (!deviation_in_range) {
+    opts.block_size_deviation = 0;
+  }
+
+  if (opts.block_restart_interval < 1) {
+    opts.block_restart_interval = 1;
+  }
+
+  // The index restart interval must be at least 1, and is forced to exactly 1
+  // for kHashSearch, which is currently incompatible with
+  // index_block_restart_interval > 1.
+  if (opts.index_block_restart_interval < 1 ||
+      opts.index_type == BlockBasedTableOptions::kHashSearch) {
+    opts.index_block_restart_interval = 1;
+  }
+
+  // We do not support partitioned filters without partitioning indexes.
+  if (opts.index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) {
+    opts.partition_filters = false;
+  }
+
+  // Make sure every cache entry role has an override entry: roles without one
+  // receive a copy of the general options, and existing entries whose
+  // `charged` decision is kFallback take over the general decision.
+  auto& role_overrides = opts.cache_usage_options.options_overrides;
+  const auto general_options = opts.cache_usage_options.options;
+  for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    const auto role = static_cast<CacheEntryRole>(i);
+    const auto result = role_overrides.insert({role, general_options});
+    const bool already_present = !result.second;
+    if (already_present && result.first->second.charged ==
+                               CacheEntryRoleOptions::Decision::kFallback) {
+      result.first->second.charged = general_options.charged;
+    }
+  }
+}
+
+// Re-applies the sanitization done by InitializeOptions() and then delegates
+// to TableFactory::PrepareOptions(), whose Status is returned.
+Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
+ InitializeOptions();
+ return TableFactory::PrepareOptions(opts);
+}
+
+namespace {
+// Different cache kinds use the same keys for physically different values, so
+// they must not share an underlying key space with each other.
+//
+// Returns OK when at most one of block_cache, block_cache_compressed and
+// persistent_cache is configured, or when no overlap is detected. Returns
+// InvalidArgument when two caches are found to share a key space, and
+// Corruption when a cache returns a value that none of the caches inserted.
+//
+// Detection works by inserting a distinct one-byte marker under the same
+// freshly created key into every configured cache and then reading the key
+// back from each cache. Note that the marker entries are not erased here.
+Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
+ // Number of configured caches (each comparison contributes 0 or 1).
+ int cache_count = (bbto.block_cache != nullptr) +
+ (bbto.block_cache_compressed != nullptr) +
+ (bbto.persistent_cache != nullptr);
+ if (cache_count <= 1) {
+ // Nothing to share / overlap
+ return Status::OK();
+ }
+
+ // Simple pointer equality
+ if (bbto.block_cache == bbto.block_cache_compressed) {
+ return Status::InvalidArgument(
+ "block_cache same as block_cache_compressed not currently supported, "
+ "and would be bad for performance anyway");
+ }
+
+ // More complex test of shared key space, in case the instances are wrappers
+ // for some shared underlying cache.
+ CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime();
+ // The block caches are handed the address of their marker as the entry
+ // value and are later checked by pointer identity; the markers are static,
+ // so those addresses stay valid after this function returns.
+ static char kRegularBlockCacheMarker = 'b';
+ static char kCompressedBlockCacheMarker = 'c';
+ static char kPersistentCacheMarker = 'p';
+ // Insert failures are deliberately ignored (PermitUncheckedError); a missing
+ // entry simply makes the corresponding lookup below find nothing.
+ if (bbto.block_cache) {
+ bbto.block_cache
+ ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>())
+ .PermitUncheckedError();
+ }
+ if (bbto.block_cache_compressed) {
+ bbto.block_cache_compressed
+ ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>())
+ .PermitUncheckedError();
+ }
+ if (bbto.persistent_cache) {
+ // Note: persistent cache copies the data, not keeping the pointer
+ bbto.persistent_cache
+ ->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1)
+ .PermitUncheckedError();
+ }
+ // If we get something different from what we inserted, that indicates
+ // dangerously overlapping key spaces.
+ if (bbto.block_cache) {
+ auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice());
+ if (handle) {
+ auto v = static_cast<char*>(bbto.block_cache->Value(handle));
+ // Capture the marker byte before the handle is released.
+ char c = *v;
+ bbto.block_cache->Release(handle);
+ // Block cache markers are recognized by address, the persistent cache
+ // marker by its byte value (the persistent cache stores a copy).
+ if (v == &kCompressedBlockCacheMarker) {
+ return Status::InvalidArgument(
+ "block_cache and block_cache_compressed share the same key space, "
+ "which is not supported");
+ } else if (c == kPersistentCacheMarker) {
+ return Status::InvalidArgument(
+ "block_cache and persistent_cache share the same key space, "
+ "which is not supported");
+ } else if (v != &kRegularBlockCacheMarker) {
+ return Status::Corruption("Unexpected mutation to block_cache");
+ }
+ }
+ }
+ if (bbto.block_cache_compressed) {
+ auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice());
+ if (handle) {
+ auto v = static_cast<char*>(bbto.block_cache_compressed->Value(handle));
+ // Capture the marker byte before the handle is released.
+ char c = *v;
+ bbto.block_cache_compressed->Release(handle);
+ if (v == &kRegularBlockCacheMarker) {
+ return Status::InvalidArgument(
+ "block_cache_compressed and block_cache share the same key space, "
+ "which is not supported");
+ } else if (c == kPersistentCacheMarker) {
+ return Status::InvalidArgument(
+ "block_cache_compressed and persistent_cache share the same key "
+ "space, "
+ "which is not supported");
+ } else if (v != &kCompressedBlockCacheMarker) {
+ return Status::Corruption(
+ "Unexpected mutation to block_cache_compressed");
+ }
+ }
+ }
+ if (bbto.persistent_cache) {
+ // The persistent cache returns a copy of the stored bytes, so markers are
+ // compared by value here. A failed lookup is ignored and leaves `data`
+ // empty.
+ std::unique_ptr<char[]> data;
+ size_t size = 0;
+ bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size)
+ .PermitUncheckedError();
+ if (data && size > 0) {
+ if (data[0] == kRegularBlockCacheMarker) {
+ return Status::InvalidArgument(
+ "persistent_cache and block_cache share the same key space, "
+ "which is not supported");
+ } else if (data[0] == kCompressedBlockCacheMarker) {
+ return Status::InvalidArgument(
+ "persistent_cache and block_cache_compressed share the same key "
+ "space, "
+ "which is not supported");
+ } else if (data[0] != kPersistentCacheMarker) {
+ return Status::Corruption("Unexpected mutation to persistent_cache");
+ }
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+// Opens a block-based table by forwarding to BlockBasedTable::Open() and
+// returns its Status. Besides the caller's read options, reader options, file
+// and file size, the call passes this factory's table options, its table
+// reader cache reservation manager (which may be unset) and its tail prefetch
+// statistics. `table_reader` is handed through as the output argument
+// (presumably populated by Open() on success -- confirm there).
+Status BlockBasedTableFactory::NewTableReader(
+ const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table_reader,
+ bool prefetch_index_and_filter_in_cache) const {
+ return BlockBasedTable::Open(
+ ro, table_reader_options.ioptions, table_reader_options.env_options,
+ table_options_, table_reader_options.internal_comparator, std::move(file),
+ file_size, table_reader, table_reader_cache_res_mgr_,
+ table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
+ table_reader_options.skip_filters, table_reader_options.level,
+ table_reader_options.immortal, table_reader_options.largest_seqno,
+ table_reader_options.force_direct_prefetch, &tail_prefetch_stats_,
+ table_reader_options.block_cache_tracer,
+ table_reader_options.max_file_size_for_l0_meta_pin,
+ table_reader_options.cur_db_session_id, table_reader_options.cur_file_num,
+ table_reader_options.unique_id);
+}
+
+// Allocates a BlockBasedTableBuilder configured with this factory's table
+// options, the given builder options and `file`. The builder is returned as a
+// raw pointer and is not retained by the factory.
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+ const TableBuilderOptions& table_builder_options,
+ WritableFileWriter* file) const {
+ return new BlockBasedTableBuilder(table_options_, table_builder_options,
+ file);
+}
+
+// Validates table_options_ against the given DB and column family options.
+// Checks run in a fixed order and the first failure is returned as a non-OK
+// Status (InvalidArgument or NotSupported from the checks in this function,
+// or whatever CheckCacheOptionCompatibility() reports). When every check
+// passes, the result of TableFactory::ValidateOptions() is returned.
+Status BlockBasedTableFactory::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      cf_opts.prefix_extractor == nullptr) {
+    return Status::InvalidArgument(
+        "Hash index is specified for block-based "
+        "table, but prefix_extractor is not given");
+  }
+  if (table_options_.cache_index_and_filter_blocks &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable cache_index_and_filter_blocks, "
+        "but block cache is disabled");
+  }
+  if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable pin_l0_filter_and_index_blocks_in_cache, "
+        "but block cache is disabled");
+  }
+  if (!IsSupportedFormatVersion(table_options_.format_version)) {
+    return Status::InvalidArgument(
+        "Unsupported BlockBasedTable format_version. Please check "
+        "include/rocksdb/table.h for more info");
+  }
+  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
+    return Status::InvalidArgument(
+        "Enable block_align, but compression "
+        "enabled");
+  }
+  // x & (x - 1) is non-zero exactly when x has more than one bit set.
+  if (table_options_.block_align &&
+      (table_options_.block_size & (table_options_.block_size - 1))) {
+    return Status::InvalidArgument(
+        "Block alignment requested but block size is not a power of 2");
+  }
+  if (table_options_.block_size > std::numeric_limits<uint32_t>::max()) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.data_block_index_type ==
+          BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+      table_options_.data_block_hash_table_util_ratio <= 0) {
+    return Status::InvalidArgument(
+        "data_block_hash_table_util_ratio should be greater than 0 when "
+        "data_block_index_type is set to kDataBlockBinaryAndHash");
+  }
+  if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
+    // TODO(myabandeh): support it
+    return Status::InvalidArgument(
+        "max_successive_merges larger than 0 is currently inconsistent with "
+        "unordered_write");
+  }
+
+  // Roles for which the `charged` decision may be set to something other
+  // than kFallback.
+  static const std::set<CacheEntryRole> kMemoryChargingSupported = {
+      CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+      CacheEntryRole::kFilterConstruction,
+      CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata,
+      CacheEntryRole::kBlobCache};
+  for (const auto& role_and_options :
+       table_options_.cache_usage_options.options_overrides) {
+    const CacheEntryRole role = role_and_options.first;
+    const CacheEntryRoleOptions& options = role_and_options.second;
+    const auto& role_name =
+        kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)];
+    // Builds the InvalidArgument status shared by all "charging enabled
+    // but ..." failures for this role.
+    const auto enabled_but = [&role_name](const char* reason) {
+      return Status::InvalidArgument(
+          "Enable CacheEntryRoleOptions::charged"
+          " for CacheEntryRole " +
+          role_name + reason);
+    };
+
+    if (options.charged != CacheEntryRoleOptions::Decision::kFallback &&
+        kMemoryChargingSupported.count(role) == 0) {
+      return Status::NotSupported(
+          "Enable/Disable CacheEntryRoleOptions::charged"
+          " for CacheEntryRole " +
+          role_name + " is not supported");
+    }
+    if (options.charged != CacheEntryRoleOptions::Decision::kEnabled) {
+      // The remaining checks only concern roles with charging enabled.
+      continue;
+    }
+    if (table_options_.no_block_cache) {
+      return enabled_but(" but block cache is disabled");
+    }
+    if (role == CacheEntryRole::kBlobCache) {
+      // A disabled block cache has already been rejected just above, so it is
+      // not checked again for the blob cache role.
+      if (cf_opts.blob_cache == nullptr) {
+        return enabled_but(" but blob cache is not configured");
+      }
+      if (table_options_.block_cache == cf_opts.blob_cache) {
+        return enabled_but(" but blob cache is the same as block cache");
+      }
+      if (cf_opts.blob_cache->GetCapacity() >
+          table_options_.block_cache->GetCapacity()) {
+        return enabled_but(
+            " but blob cache capacity is larger than block cache capacity");
+      }
+    }
+  }
+  {
+    Status s = CheckCacheOptionCompatibility(table_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // Serialization is used purely as a membership test for the checksum enum;
+  // the produced string is discarded.
+  std::string garbage;
+  if (!SerializeEnum<ChecksumType>(checksum_type_string_map,
+                                   table_options_.checksum, &garbage)) {
+    return Status::InvalidArgument(
+        "Unrecognized ChecksumType for checksum: " +
+        std::to_string(static_cast<uint32_t>(table_options_.checksum)));
+  }
+  return TableFactory::ValidateOptions(db_opts, cf_opts);
+}
+
+// Renders the block-based table options held in table_options_ as a
+// human-readable, multi-line string with one "name: value" entry per line.
+//
+// Pointer-valued options (flush block policy factory, block caches,
+// persistent cache) are printed as addresses. For each cache that is set, the
+// cache's own GetPrintableOptions() output is appended right after it.
+//
+// Every entry is formatted through a fixed kBufferSize-byte scratch buffer,
+// so snprintf truncates any single entry that would not fit.
+//
+// @return the formatted option listing.
+std::string BlockBasedTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n",
+           table_options_.flush_block_policy_factory->Name(),
+           static_cast<void*>(table_options_.flush_block_policy_factory.get()));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n",
+           table_options_.cache_index_and_filter_blocks);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           " cache_index_and_filter_blocks_with_high_priority: %d\n",
+           table_options_.cache_index_and_filter_blocks_with_high_priority);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           " pin_l0_filter_and_index_blocks_in_cache: %d\n",
+           table_options_.pin_l0_filter_and_index_blocks_in_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
+           table_options_.pin_top_level_index_and_filter);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " index_type: %d\n",
+           table_options_.index_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " data_block_index_type: %d\n",
+           table_options_.data_block_index_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " index_shortening: %d\n",
+           static_cast<int>(table_options_.index_shortening));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n",
+           table_options_.data_block_hash_table_util_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " no_block_cache: %d\n",
+           table_options_.no_block_cache);
+  ret.append(buffer);
+
+  // Uncompressed block cache: address, then (if set) its name and options.
+  snprintf(buffer, kBufferSize, " block_cache: %p\n",
+           static_cast<void*>(table_options_.block_cache.get()));
+  ret.append(buffer);
+  if (table_options_.block_cache) {
+    const char* block_cache_name = table_options_.block_cache->Name();
+    if (block_cache_name != nullptr) {
+      snprintf(buffer, kBufferSize, " block_cache_name: %s\n",
+               block_cache_name);
+      ret.append(buffer);
+    }
+    ret.append(" block_cache_options:\n");
+    ret.append(table_options_.block_cache->GetPrintableOptions());
+  }
+
+  // Compressed block cache: same layout as above. The name entry carries its
+  // own label so it cannot be confused with the uncompressed cache's
+  // "block_cache_name" entry in the output.
+  snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n",
+           static_cast<void*>(table_options_.block_cache_compressed.get()));
+  ret.append(buffer);
+  if (table_options_.block_cache_compressed) {
+    const char* block_cache_compressed_name =
+        table_options_.block_cache_compressed->Name();
+    if (block_cache_compressed_name != nullptr) {
+      snprintf(buffer, kBufferSize, " block_cache_compressed_name: %s\n",
+               block_cache_compressed_name);
+      ret.append(buffer);
+    }
+    ret.append(" block_cache_compressed_options:\n");
+    ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
+  }
+
+  // Persistent cache: address, then (if set) its options.
+  snprintf(buffer, kBufferSize, " persistent_cache: %p\n",
+           static_cast<void*>(table_options_.persistent_cache.get()));
+  ret.append(buffer);
+  if (table_options_.persistent_cache) {
+    snprintf(buffer, kBufferSize, " persistent_cache_options:\n");
+    ret.append(buffer);
+    ret.append(table_options_.persistent_cache->GetPrintableOptions());
+  }
+
+  snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n",
+           table_options_.block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " block_size_deviation: %d\n",
+           table_options_.block_size_deviation);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " block_restart_interval: %d\n",
+           table_options_.block_restart_interval);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n",
+           table_options_.index_block_restart_interval);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n",
+           table_options_.metadata_block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " partition_filters: %d\n",
+           table_options_.partition_filters);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n",
+           table_options_.use_delta_encoding);
+  ret.append(buffer);
+  // A missing filter policy is printed as the literal text "nullptr".
+  snprintf(buffer, kBufferSize, " filter_policy: %s\n",
+           table_options_.filter_policy == nullptr
+               ? "nullptr"
+               : table_options_.filter_policy->Name());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
+           table_options_.whole_key_filtering);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " verify_compression: %d\n",
+           table_options_.verify_compression);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n",
+           table_options_.read_amp_bytes_per_bit);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " format_version: %d\n",
+           table_options_.format_version);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " enable_index_compression: %d\n",
+           table_options_.enable_index_compression);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " block_align: %d\n",
+           table_options_.block_align);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.max_auto_readahead_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n",
+           static_cast<int>(table_options_.prepopulate_block_cache));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           " initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.initial_auto_readahead_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           " num_file_reads_for_auto_readahead: %" PRIu64 "\n",
+           table_options_.num_file_reads_for_auto_readahead);
+  ret.append(buffer);
+  return ret;
+}
+
+// Looks up an options object by name. For kBlockCacheOpts() this yields the
+// configured block cache, or nullptr when no_block_cache is set; every other
+// name is delegated to TableFactory::GetOptionsPtr().
+const void* BlockBasedTableFactory::GetOptionsPtr(
+    const std::string& name) const {
+  if (name != kBlockCacheOpts()) {
+    return TableFactory::GetOptionsPtr(name);
+  }
+  return table_options_.no_block_cache ? nullptr
+                                       : table_options_.block_cache.get();
+}
+
+#ifndef ROCKSDB_LITE
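+// NOTE: the description below documents GetBlockBasedTableOptionsFromMap()
+// (defined later in this file), not ParseOption(), which directly follows
+// this comment.
+//
+// Take a default BlockBasedTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// BlockBasedTableOptions "new_table_options".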
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+// We currently only support the following FilterPolicy in the convenience
+// functions:
+// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+// to specify BloomFilter. The above string is equivalent to calling
+// NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+// [Example]:
+// - Pass {"filter_policy", "bloomfilter:4:true"} in
+// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+// per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+// We currently only support LRU cache in the GetOptions API. The LRU
+// cache can be set by directly specifying its size.
+// [Example]:
+// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+// equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+// "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+// with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped character
+// prefixed by '\' in the values of the opts_map will be converted back to
+// its raw form before the value is assigned to the associated option.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+// instead of resulting in an unknown-option error.
+// @return Status::OK() on success. Otherwise, a non-ok status indicating
+// error will be returned, and "new_table_options" will be set to
+// "table_options".
+// Parses a single option by delegating to TableFactory::ParseOption(), then
+// relaxes the outcome: when config_options.input_strings_escaped is set and
+// the option is a "by name" option, a parse failure is reported as OK.
+// In every other case the base-class status is returned unchanged.
+Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
+                                           const OptionTypeInfo& opt_info,
+                                           const std::string& opt_name,
+                                           const std::string& opt_value,
+                                           void* opt_ptr) {
+  Status base_status = TableFactory::ParseOption(config_options, opt_info,
+                                                 opt_name, opt_value, opt_ptr);
+  if (base_status.ok()) {
+    return base_status;
+  }
+  // Got an error. Per the original author's note, !input_strings_escaped
+  // indicates the old API, where everything is parsable; only the escaped
+  // (newer) path forgives failures, and only for by-name options.
+  if (config_options.input_strings_escaped && opt_info.IsByName()) {
+    return Status::OK();
+  }
+  return base_status;
+}
+
+// Convenience overload: builds a ConfigOptions with input_strings_escaped,
+// ignore_unknown_options, invoke_prepare_options and
+// ignore_unsupported_options all set to false, then forwards to the
+// ConfigOptions-taking overload.
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options) {
+  ConfigOptions strict_config;
+  strict_config.ignore_unknown_options = false;
+  strict_config.ignore_unsupported_options = false;
+  strict_config.input_strings_escaped = false;
+  strict_config.invoke_prepare_options = false;
+
+  return GetBlockBasedTableOptionsFromString(strict_config, table_options,
+                                             opts_str, new_table_options);
+}
+// Splits "opts_str" into a name/value map with StringToMap() and applies it
+// through GetBlockBasedTableOptionsFromMap(). A StringToMap() failure is
+// returned as-is. A failure from the map step that is not already
+// InvalidArgument is re-wrapped as InvalidArgument carrying the same state
+// message.
+Status GetBlockBasedTableOptionsFromString(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options) {
+  std::unordered_map<std::string, std::string> parsed_opts;
+  Status s = StringToMap(opts_str, &parsed_opts);
+  if (s.ok()) {
+    s = GetBlockBasedTableOptionsFromMap(config_options, table_options,
+                                         parsed_opts, new_table_options);
+    // Translate any other errors (NotFound, NotSupported, ...) to
+    // InvalidArgument.
+    if (!s.ok() && !s.IsInvalidArgument()) {
+      return Status::InvalidArgument(s.getState());
+    }
+  }
+  return s;
+}
+
+// Convenience overload: copies the two boolean arguments into a fresh
+// ConfigOptions, disables invoke_prepare_options, and forwards to the
+// ConfigOptions-taking overload.
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions legacy_config;
+  legacy_config.invoke_prepare_options = false;
+  legacy_config.ignore_unknown_options = ignore_unknown_options;
+  legacy_config.input_strings_escaped = input_strings_escaped;
+
+  return GetBlockBasedTableOptionsFromMap(legacy_config, table_options,
+                                          opts_map, new_table_options);
+}
+
+// Applies "opts_map" to a temporary BlockBasedTableFactory seeded with
+// "table_options". On success "*new_table_options" receives the factory's
+// resulting options; on failure it receives an unmodified copy of
+// "table_options". The ConfigureFromMap() status is returned either way.
+// "new_table_options" must not be null (checked with assert).
+Status GetBlockBasedTableOptionsFromMap(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options) {
+  assert(new_table_options);
+  BlockBasedTableFactory scratch_factory(table_options);
+  Status s = scratch_factory.ConfigureFromMap(config_options, opts_map);
+  *new_table_options =
+      s.ok() ? *(scratch_factory.GetOptions<BlockBasedTableOptions>())
+             : table_options;
+  return s;
+}
+#endif // !ROCKSDB_LITE
+
+// Allocates a BlockBasedTableFactory built from "_table_options" with `new`
+// and returns it as a raw TableFactory pointer.
+// NOTE(review): presumably the caller takes ownership of the returned
+// object -- confirm against the public header.
+TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& _table_options) {
+  TableFactory* const factory = new BlockBasedTableFactory(_table_options);
+  return factory;
+}
+
+const std::string BlockBasedTablePropertyNames::kIndexType =
+ "rocksdb.block.based.table.index.type";  // presumably the table-property key for the index type -- confirm at use sites
+const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
+ "rocksdb.block.based.table.whole.key.filtering";  // presumably the table-property key for whole-key filtering -- confirm
+const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
+ "rocksdb.block.based.table.prefix.filtering";  // presumably the table-property key for prefix filtering -- confirm
+const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";  // presumably a meta-block name for hash-index prefixes -- confirm
+const std::string kHashIndexPrefixesMetadataBlock =
+ "rocksdb.hashindex.metadata";  // presumably a meta-block name for hash-index metadata -- confirm
+const std::string kPropTrue = "1";  // string constant "1"; presumably the stored form of a true property -- confirm
+const std::string kPropFalse = "0";  // string constant "0"; presumably the stored form of a false property -- confirm
+
+} // namespace ROCKSDB_NAMESPACE