summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/db/compaction/compaction.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/rocksdb/db/compaction/compaction.h559
1 files changed, 559 insertions, 0 deletions
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one ore more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts,
+ Temperature output_temperature, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, const std::string& trim_ts = "",
+ double score = -1, bool deletion_compaction = false,
+ bool l0_files_might_overlap = true,
+ CompactionReason compaction_reason = CompactionReason::kUnknown,
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault,
+ double blob_garbage_collection_age_cutoff = -1);
+
+ // The type of the penultimate level output range
+ enum class PenultimateOutputRangeType : int {
+ kNotSupported, // it cannot output to the penultimate level
+ kFullRange, // any data could be output to the penultimate level
+ kNonLastRange, // only the keys within non_last_level compaction inputs can
+ // be outputted to the penultimate level
+ kDisabled, // no data can be outputted to the penultimate level
+ };
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated to the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated to the specified
+ // compaction input level.
+ // The function will return 0 if when "compaction_input_level" < 0
+ // or "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // Target output file size for this compaction
+ uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ const CompressionOptions& output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // Whether need to write output file to second DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // The split user key in the output level if this compaction is required to
+ // split the output files according to the existing cursor in the output
+ // level under round-robin compaction policy. Empty indicates no required
+ // splitting key
+ const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Is the compaction compact to the last level
+ bool is_last_level() const {
+ return output_level_ == immutable_options_.num_levels - 1;
+ }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ std::string trim_ts() const { return trim_ts_; }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. If all the input files are
+ // non overlapping, then is_trivial_move_ variable
+ // will be set true, else false
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when allow_trivial_move option is set in
+ // Universal compaction. Returns true, if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableOptions* immutable_options() const {
+ return &immutable_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_file_size_. In universal compaction, that
+ // is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Create a SstPartitioner from sst_partitioner_factory
+ std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // Returns true iff at least one input file references a blob file.
+ //
+ // PRE: input version has been set.
+ bool DoesInputReferenceBlobFiles() const;
+
+ // test function to validate the functionality of IsBottommostLevel()
+ // function -- determines if compaction with inputs and storage is bottommost
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ Slice GetPenultimateLevelSmallestUserKey() const {
+ return penultimate_level_smallest_user_key_;
+ }
+
+ Slice GetPenultimateLevelLargestUserKey() const {
+ return penultimate_level_largest_user_key_;
+ }
+
+ PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+ return penultimate_output_range_type_;
+ }
+
+ // Return true if the compaction supports per_key_placement
+ bool SupportsPerKeyPlacement() const;
+
+ // Get per_key_placement penultimate output level, which is `last_level - 1`
+ // if per_key_placement feature is supported. Otherwise, return -1.
+ int GetPenultimateLevel() const;
+
+ // Return true if the given range is overlap with penultimate level output
+ // range.
+ // Both smallest_key and largest_key include timestamps if user-defined
+ // timestamp is enabled.
+ bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+ const Slice& largest_key) const;
+
+ // Return true if the key is within penultimate level output range for
+ // per_key_placement feature, which is safe to place the key to the
+ // penultimate level. different compaction strategy has different rules.
+ // If per_key_placement is not supported, always return false.
+ // TODO: currently it doesn't support moving data from the last level to the
+ // penultimate level
+ // key includes timestamp if user-defined timestamp is enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+ CompactionReason compaction_reason() const { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ Temperature output_temperature() const { return output_temperature_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ bool enable_blob_garbage_collection() const {
+ return enable_blob_garbage_collection_;
+ }
+
+ double blob_garbage_collection_age_cutoff() const {
+ return blob_garbage_collection_age_cutoff_;
+ }
+
+ // start and end are sub compact range. Null if no boundary.
+ // This is used to filter out some input files' ancester's time range.
+ uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+ const InternalKey* end) const;
+
+ // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+ // compaction begin and compaction completion callbacks match.
+ void SetNotifyOnCompactionCompleted() {
+ notify_on_compaction_completion_ = true;
+ }
+
+ bool ShouldNotifyOnCompactionCompleted() const {
+ return notify_on_compaction_completion_;
+ }
+
+ static constexpr int kInvalidLevel = -1;
+
+ // Evaluate penultimate output level. If the compaction supports
+ // per_key_placement feature, it returns the penultimate level number.
+ // Otherwise, it's set to kInvalidLevel (-1), which means
+ // output_to_penultimate_level is not supported.
+ // Note: even the penultimate level output is supported (PenultimateLevel !=
+ // kInvalidLevel), some key range maybe unsafe to be outputted to the
+ // penultimate level. The safe key range is populated by
+ // `PopulatePenultimateLevelOutputRange()`.
+ // Which could potentially disable all penultimate level output.
+ static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options,
+ const int start_level,
+ const int output_level);
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key,
+ int exclude_level = -1);
+
+ // populate penultimate level output range, which will be used to determine if
+ // a key is safe to output to the penultimate level (details see
+ // `Compaction::WithinPenultimateLevelOutputRange()`.
+ void PopulatePenultimateLevelOutputRange();
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+ // plumb down appropriate key boundaries to RangeDelAggregator during
+ // compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // levels to which output files are stored
+ uint64_t target_output_file_size_;
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableOptions immutable_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ Temperature output_temperature_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+ // should it split the output file using the compact cursor?
+ const InternalKey* output_split_key_;
+
+ // L0 files in LSM-tree might be overlapping. But the compaction picking
+ // logic might pick a subset of the files that aren't overlapping. if
+ // that is the case, set the value to false. Otherwise, set it true.
+ bool l0_files_might_overlap_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottom most level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // The data with timestamp > trim_ts_ will be removed
+ const std::string trim_ts_;
+
+ // True if we can do trivial move in Universal multi level
+ // compaction
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice smallest_user_key_;
+
+ // largest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+
+ // Notify on compaction completion only if listener was notified on compaction
+ // begin.
+ bool notify_on_compaction_completion_;
+
+ // Enable/disable GC collection for blobs during compaction.
+ bool enable_blob_garbage_collection_;
+
+ // Blob garbage collection age cutoff.
+ double blob_garbage_collection_age_cutoff_;
+
+ // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
+ // means not supported.
+ const int penultimate_level_;
+
+ // Key range for penultimate level output
+ // includes timestamp if user-defined timestamp is enabled.
+ // penultimate_output_range_type_ shows the range type
+ Slice penultimate_level_smallest_user_key_;
+ Slice penultimate_level_largest_user_key_;
+ PenultimateOutputRangeType penultimate_output_range_type_ =
+ PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+ const int level;
+ const Slice key;
+ const Slice value;
+ const SequenceNumber seq_num;
+
+ bool output_to_penultimate_level;
+
+ PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+ SequenceNumber _seq_num)
+ : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+ output_to_penultimate_level = false;
+ }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE