summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/db/table_cache.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/rocksdb/db/table_cache.h275
1 files changed, 275 insertions, 0 deletions
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..2e50f2c77
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,275 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see the NewIterator() comments.
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache.
+class TableCache {
+ public:
+ TableCache(const ImmutableOptions& ioptions,
+ const FileOptions* storage_options, Cache* cache,  // cache: allocated separately (see class comment)
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_session_id);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+ // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+ // underlying the returned iterator, or nullptr if no Table object underlies
+ // the returned iterator. The returned "*table_reader_ptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
+ // then range_del_iter is set to a TruncatedRangeDelIterator for range
+ // tombstones in the SST file corresponding to the specified file number. The
+ // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
+ // file's boundary.
+ // @param options Must outlive the returned iterator.
+ // @param range_del_agg If non-nullptr, adds range deletions to the
+ // aggregator. On error, the error is returned in a NewErrorInternalIterator.
+ // @param for_compaction If true, a new TableReader may be allocated (but
+ // not cached), depending on the CF options
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ InternalIterator* NewIterator(
+ const ReadOptions& options, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ TruncatedRangeDelIterator** range_del_iter = nullptr);  // optional; see comment above
+
+ // If a seek to internal key "k" in the specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry.
+ // @param get_context Context for get operation. The result of the lookup
+ // can be retrieved by calling get_context->State()
+ // @param file_read_hist If non-nullptr, the file reader statistics are
+ // recorded
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status Get(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
+
+ // Return, through `out_iter`, the range deletion tombstone iterator of the
+ // file specified by `file_meta`.
+ Status GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+ // Call table reader's MultiGetFilter to use the bloom filter to filter out
+ // keys. Returns Status::NotSupported() if row cache needs to be checked.
+ // If the table cache is looked up to get the table reader, the cache handle
+ // is returned in table_handle. This handle should be passed back to
+ // MultiGet() so it can be released.
+ Status MultiGetFilter(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, int level,
+ MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
+
+ // If a seek to an internal key from mget_range in the specified file finds
+ // an entry, call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry.
+ // @param mget_range Pointer to the structure describing a batch of keys to
+ // be looked up in this table file. The result is stored
+ // in the embedded GetContext
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ DECLARE_SYNC_AND_ASYNC(
+ Status, MultiGet, const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ bool skip_range_deletions = false, int level = -1,  // NOTE(review): skip_range_deletions is undocumented
+ Cache::Handle* table_handle = nullptr);  // handle from MultiGetFilter(), if any
+
+ // Evict any entry for the specified file number from `cache`.
+ static void Evict(Cache* cache, uint64_t file_number);
+
+ // Query whether the specified file number is currently in `cache`.
+ static bool HasEntry(Cache* cache, uint64_t file_number);
+
+ // Clean the table handle and erase it from the table cache.
+ // Used on DB close, or when the file is not live anymore.
+ void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+ // Find the table reader for the file described by file_meta.
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at; -1 means not specified
+ Status FindTable(
+ const ReadOptions& ro, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, Cache::Handle**,  // NOTE(review): unnamed; presumably the output handle
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ const bool no_io = false, bool record_read_stats = true,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Get the TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Get the table properties of a given table.
+ // @param no_io Indicates whether the table should be loaded into the cache
+ // if it is not present in the table cache yet.
+ // @return `properties` will be reset on success. Please note that we will
+ // return Status::Incomplete() if table is not present in cache and
+ // we set `no_io` to be true.
+ Status GetTableProperties(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool no_io = false);
+
+ Status ApproximateKeyAnchors(const ReadOptions& ro,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::vector<TableReader::Anchor>& anchors);  // NOTE(review): undocumented; presumably fills `anchors` - confirm
+
+ // Return total memory usage of the table reader of the file.
+ // 0 if table reader of the file is not loaded.
+ size_t GetMemoryUsageByTableReader(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated offset of a key in the file described by file_meta.
+ uint64_t ApproximateOffsetOf(
+ const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated data size between start and end keys in the file
+ // described by file_meta (the start key must not be greater than the end key).
+ uint64_t ApproximateSize(
+ const Slice& start, const Slice& end, const FileMetaData& file_meta,
+ TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
+ Cache* get_cache() const { return cache_; }  // returns the cache_ pointer held by this object
+
+ // Capacity of the backing Cache that indicates infinite TableCache capacity.
+ // For example when max_open_files is -1 we set the backing Cache to this.
+ static const int kInfiniteCapacity = 0x400000;
+
+ // Marks the tables opened with this TableCache as immortal, i.e., their
+ // lifetime is as long as that of the DB. No-op unless the check below passes.
+ void SetTablesAreImmortal() {
+ if (cache_->GetCapacity() >= kInfiniteCapacity) {  // only when the backing cache is "infinite"
+ immortal_tables_ = true;
+ }
+ }
+
+ private:
+ // Build a table reader (NOTE(review): presumably output via *table_reader)
+ Status GetTableReader(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, bool sequential_mode,
+ bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Update the max_covering_tombstone_seq in the GetContext for each key based
+ // on the range deletions in the table
+ void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+ MultiGetContext::Range& table_range);
+
+ // Create a key prefix for looking up the row cache. The prefix is of the
+ // format row_cache_id + fd_number + seq_no. Later, the user key can be
+ // appended to form the full key.
+ void CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context, IterKey& row_cache_key);
+
+ // Helper function to look up the row cache for a key. It appends the
+ // user key to row_cache_key at offset prefix_size.
+ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context);
+
+ const ImmutableOptions& ioptions_;  // source of the row cache (see class comment)
+ const FileOptions& file_options_;  // NOTE(review): presumably bound from the ctor's storage_options - confirm
+ Cache* const cache_;  // exposed via get_cache()
+ std::string row_cache_id_;  // leading part of the row cache key prefix (see CreateRowCacheKeyPrefix)
+ bool immortal_tables_;  // set to true by SetTablesAreImmortal()
+ BlockCacheTracer* const block_cache_tracer_;
+ Striped<port::Mutex, Slice> loader_mutex_;  // NOTE(review): presumably guards table loading, striped by key - confirm
+ std::shared_ptr<IOTracer> io_tracer_;
+ std::string db_session_id_;
+};
+
+} // namespace ROCKSDB_NAMESPACE