author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/rocksdb/db/memtable.h
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/db/memtable.h')
-rw-r--r--  src/rocksdb/db/memtable.h  664
1 file changed, 664 insertions, 0 deletions
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..6db2721e4
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,664 @@

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "db/dbformat.h"
#include "db/kv_checksum.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "memory/allocator.h"
#include "memory/concurrent_arena.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "table/multiget_context.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/hash_containers.h"

namespace ROCKSDB_NAMESPACE {

struct FlushJobInfo;
class Mutex;
class MemTableIterator;
class MergeContext;
class SystemClock;

struct ImmutableMemTableOptions {
  explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
                                    const MutableCFOptions& mutable_cf_options);
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool memtable_whole_key_filtering;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
  bool allow_data_in_errors;
  uint32_t protection_bytes_per_key;
};
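// Illustrative sketch (not part of the upstream header): a user-supplied
// inplace_callback matching the signature above. It assumes values are
// 8-byte counters and applies `delta_value` in place using the fixed-width
// helpers from util/coding.h, so no new memtable entry is appended:
//
//   UpdateStatus IncrementCounter(char* existing_value,
//                                 uint32_t* existing_value_size,
//                                 Slice delta_value,
//                                 std::string* /*merged_value*/) {
//     uint64_t v = DecodeFixed64(existing_value);
//     v += DecodeFixed64(delta_value.data());
//     EncodeFixed64(existing_value, v);
//     *existing_value_size = sizeof(uint64_t);
//     return UpdateStatus::UPDATED_INPLACE;
//   }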
// Batched counters to be updated when inserting keys in one write batch.
// In post process of the write batch, these can be updated together.
// Only used in concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};

using MultiGetRange = MultiGetContext::Range;
// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
// synchronization if this memtable is immutable. Calling MarkImmutable() is
// not sufficient to guarantee immutability. It is up to higher layers of
// code to determine if this MemTable can still be modified by other threads.
// Eg: The Superversion stores a pointer to the current MemTable (that can
// be modified) and a separate list of the MemTables that can no longer be
// written to (aka the 'immutable memtables').
class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const override;
    virtual int operator()(const char* prefix_len_key,
                           const DecodedType& key) const override;
  };

  // MemTables are reference counted. The initial reference count
  // is zero and the caller must call Ref() at least once.
  //
  // earliest_seq should be the current SequenceNumber in the db such that any
  // key inserted into this memtable will have an equal or larger seq number.
  // (When a db is first created, the earliest sequence number will be 0).
  // If the earliest sequence number is not known, kMaxSequenceNumber may be
  // used, but this may prevent some transactions from succeeding until the
  // first key is inserted into the memtable.
  explicit MemTable(const InternalKeyComparator& comparator,
                    const ImmutableOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
                    WriteBufferManager* write_buffer_manager,
                    SequenceNumber earliest_seq, uint32_t column_family_id);
  // No copying allowed
  MemTable(const MemTable&) = delete;
  MemTable& operator=(const MemTable&) = delete;

  // Do not delete this MemTable unless Unref() indicates it is not in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }
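  // Illustrative sketch (not part of the upstream header): the owner-side
  // lifecycle implied by Ref()/Unref(), assuming the required external
  // synchronization (e.g. the db mutex) is held:
  //
  //   mem->Ref();                         // owner takes a reference
  //   ...                                 // memtable is read and written
  //   MemTable* to_delete = mem->Unref();
  //   if (to_delete != nullptr) {
  //     delete to_delete;                 // last reference was dropped
  //   }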
  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate though.
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
  }

  // Used by MemTableListVersion::MemoryAllocatedBytesExcludingLast.
  size_t MemoryAllocatedBytes() const {
    return table_->ApproximateMemoryUsage() +
           range_del_table_->ApproximateMemoryUsage() +
           arena_.MemoryAllocatedBytes();
  }

  // Returns a vector of unique random memtable entries of size 'sample_size'.
  //
  // Note: the entries are stored in the unordered_set as length-prefixed keys,
  // hence their representation in the set as "const char*".
  // Note2: the size of the output set 'entries' is not enforced to be strictly
  // equal to 'target_sample_size'. Its final size might be slightly
  // greater or slightly less than 'target_sample_size'.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  // REQUIRES: SkipList memtable representation. This function is not
  // implemented for any other type of memtable representation (vectorrep,
  // hashskiplist, ...).
  void UniqueRandomSample(const uint64_t& target_sample_size,
                          std::unordered_set<const char*>* entries) {
    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
    // Extend it to all other memtable representations.
    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
  }

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
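  // Illustrative sketch (not part of the upstream header): scanning the
  // memtable with an arena-allocated iterator. ScopedArenaIterator is
  // assumed here as the RAII wrapper used elsewhere in RocksDB for
  // iterators whose state lives in an Arena and therefore must not be
  // deleted directly:
  //
  //   Arena arena;
  //   ReadOptions ropts;
  //   ScopedArenaIterator iter(mem->NewIterator(ropts, &arena));
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key; decode it with ParseInternalKey().
  //   }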
  // Returns an iterator that yields the range tombstones of the memtable.
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live.
  // @param immutable_memtable Whether this memtable is an immutable memtable.
  // This information is not stored in memtable itself, so it needs to be
  // specified by the caller. This flag is used internally to decide whether a
  // cached fragmented range tombstone list can be returned. This cached
  // version is constructed when a memtable becomes immutable. Setting the
  // flag to false will always yield correct results, but may incur a
  // performance penalty as it always creates a new fragmented range
  // tombstone list.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  Status VerifyEncodedEntry(Slice encoded,
                            const ProtectionInfoKVOS64& kv_prot_info);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  Status Add(SequenceNumber seq, ValueType type, const Slice& key,
             const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
             bool allow_concurrent = false,
             MemTablePostProcessInfo* post_process_info = nullptr,
             void** hint = nullptr);

  // Used to Get value associated with key or Get Merge Operands associated
  // with key.
  // If do_merge = true the default behavior, which is Get value for key, is
  // executed. Expected behavior is described right below.
  // If memtable contains a value for key, store it in *value and return true.
  // If memtable contains a deletion for key, store a NotFound() error
  // in *status and return true.
  // If memtable contains a Merge operation as the most recent entry for a
  // key, and the merge process does not stop (not reaching a value or a
  // delete), prepend the current merge operand to *operands, store
  // MergeInProgress in *s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  // If do_merge = false then any Merge Operands encountered for key are
  // simply stored in merge_context.operands_list and never actually merged
  // to get a final value. The raw Merge Operands are eventually returned to
  // the user.
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true);

  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true) {
    SequenceNumber seq;
    return Get(key, value, columns, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
               callback, is_blob_index, do_merge);
  }

  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback, bool immutable_memtable);
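  // Illustrative sketch (not part of the upstream header): a minimal
  // single-threaded write-then-read through Add()/Get(), with no key-value
  // protection info and a still-mutable memtable:
  //
  //   Status s = mem->Add(1 /* seq */, kTypeValue, "k", "v",
  //                       nullptr /* kv_prot_info */);
  //   assert(s.ok());
  //
  //   LookupKey lkey("k", 1 /* seq */);
  //   std::string value;
  //   MergeContext merge_context;
  //   SequenceNumber max_covering_tombstone_seq = 0;
  //   Status status;
  //   bool found = mem->Get(lkey, &value, nullptr /* columns */,
  //                         nullptr /* timestamp */, &status, &merge_context,
  //                         &max_covering_tombstone_seq, ReadOptions(),
  //                         false /* immutable_memtable */);
  //   // On success: found == true, status.ok(), value == "v".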
  // If `key` exists in current memtable with type value_type and the existing
  // value is at least as large as the new value, updates it in-place.
  // Otherwise adds the new value to the memtable out-of-place.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
                const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);

  // If `key` exists in current memtable with type `kTypeValue` and the
  // existing value is at least as large as the new value, updates it
  // in-place. Otherwise if `key` exists in current memtable with type
  // `kTypeValue`, adds the new value to the memtable out-of-place.
  //
  // Returns `Status::NotFound` if `key` does not exist in current memtable
  // or the latest version of `key` does not have `kTypeValue`.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status UpdateCallback(SequenceNumber seq, const Slice& key,
                        const Slice& delta,
                        const ProtectionInfoKVOS64* kv_prot_info);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  uint64_t get_data_size() const {
    return data_size_.load(std::memory_order_relaxed);
  }

  // Dynamically change the memtable's capacity. If set below the current
  // usage, the next key added will trigger a flush. Can only increase size
  // when memtable prefix bloom is disabled, since we can't easily allocate
  // more space.
  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
    if (bloom_filter_ == nullptr ||
        new_write_buffer_size < write_buffer_size_) {
      write_buffer_size_.store(new_write_buffer_size,
                               std::memory_order_relaxed);
    }
  }

  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }
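  // Illustrative sketch (not part of the upstream header): a concurrent
  // writer batching counter updates instead of touching the shared atomics
  // on every insert, then folding them in once per write batch:
  //
  //   MemTablePostProcessInfo info;
  //   mem->Add(seq1, kTypeValue, k1, v1, nullptr /* kv_prot_info */,
  //            true /* allow_concurrent */, &info);
  //   mem->Add(seq2, kTypeDeletion, k2, Slice(), nullptr /* kv_prot_info */,
  //            true /* allow_concurrent */, &info);
  //   // After the whole batch has been applied:
  //   mem->BatchPostProcess(info);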
  // Sets the sequence number of the first element that was inserted into
  // the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  void SetFirstSequenceNumber(SequenceNumber first_seqno) {
    return first_seqno_.store(first_seqno, std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or
  // equal to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // Sets the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  // Used only for the MemPurge operation.
  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
    return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed two phase transaction,
  // we must take note of the log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable (i.e. do not call Add() or Update()).
  void MarkImmutable() {
    table_->MarkReadOnly();
    mem_tracker_.DoneAllocating();
  }

  // Notify the underlying storage that all data it contained has been
  // persisted.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void MarkFlushed() { table_->MarkFlushed(); }

  // Returns true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
    return table_->IsMergeOperatorSupported();
  }
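  // Illustrative sketch (not part of the upstream header): the rough order
  // of these notifications during a flush, as driven by higher layers with
  // external synchronization assumed throughout:
  //
  //   mem->MarkImmutable();  // after the memtable has been switched out
  //   // ... a background job writes the contents to an SST file ...
  //   mem->MarkFlushed();    // after the data is durably persisted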
  // Returns true if the current MemTableRep supports snapshots.
  // Inplace update prevents snapshots.
  bool IsSnapshotSupported() const {
    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated with the key.
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
    return &moptions_;
  }

  uint64_t ApproximateOldestKeyTime() const {
    return oldest_key_time_.load(std::memory_order_relaxed);
  }

  // REQUIRES: db_mutex held.
  void SetID(uint64_t id) { id_ = id; }

  uint64_t GetID() const { return id_; }

  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }

  uint64_t GetFileNumber() const { return file_number_; }

  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }

  void SetFlushInProgress(bool in_progress) {
    flush_in_progress_ = in_progress;
  }

#ifndef ROCKSDB_LITE
  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
    flush_job_info_ = std::move(info);
  }

  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
    return std::move(flush_job_info_);
  }
#endif  // !ROCKSDB_LITE

  // Returns a heuristic flush decision.
  bool ShouldFlushNow();

  void ConstructFragmentedRangeTombstones();

  // Returns whether a fragmented range tombstone list is already constructed
  // for this memtable. It should be constructed right before a memtable is
  // added to an immutable memtable list. Note that if a memtable does not
  // have any range tombstone, then no range tombstone list will ever be
  // constructed.
  // @param allow_empty Specifies whether a memtable with no range tombstone
  // is considered to have its fragmented range tombstone list constructed.
  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
    if (allow_empty) {
      return fragmented_range_tombstone_list_.get() != nullptr ||
             is_range_del_table_empty_;
    } else {
      return fragmented_range_tombstone_list_.get() != nullptr;
    }
  }

  // Returns Corruption status if verification fails.
  static Status VerifyEntryChecksum(const char* entry,
                                    size_t protection_bytes_per_key,
                                    bool allow_data_in_errors = false);

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
  std::unique_ptr<MemTableRep> table_;
  std::unique_ptr<MemTableRep> range_del_table_;
  std::atomic_bool is_range_del_table_empty_;

  // Total data size of all data inserted.
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // Dynamically changeable memtable option.
  std::atomic<size_t> write_buffer_size_;

  // These are used to manage memtable flushes to storage.
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete
  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first.
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // The earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> bloom_filter_;

  std::atomic<FlushStateEnum> flush_state_;

  SystemClock* clock_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;

  // Timestamp of oldest key.
  std::atomic<uint64_t> oldest_key_time_;

  // Memtable id to track flush.
  uint64_t id_ = 0;

  // Sequence number of the atomic flush that is responsible for this
  // memtable. The sequence number of atomic flush is a seq, such that no
  // writes with sequence numbers greater than or equal to seq are flushed,
  // while all writes with sequence numbers smaller than seq are flushed.
  SequenceNumber atomic_flush_seqno_;

  // Keeps track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`.
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
  // Flush job info of the current memtable.
  std::unique_ptr<FlushJobInfo> flush_job_info_;
#endif  // !ROCKSDB_LITE

  // Updates flush_state_ using ShouldFlushNow().
  void UpdateFlushState();

  void UpdateOldestKeyTime();

  void GetFromTable(const LookupKey& key,
                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
                    ReadCallback* callback, bool* is_blob_index,
                    std::string* value, PinnableWideColumns* columns,
                    std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);

  // Always returns non-null and assumes certain pre-checks (e.g.,
  // is_range_del_table_empty_) are done. This is only valid during the
  // lifetime of the underlying memtable.
  // read_seq and read_options.timestamp will be used as the upper bound
  // for range tombstones.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  // The fragmented range tombstones of this memtable.
  // This is constructed when this memtable becomes immutable
  // if !is_range_del_table_empty_.
  std::unique_ptr<FragmentedRangeTombstoneList>
      fragmented_range_tombstone_list_;

  // Makes sure there is a single range tombstone writer to invalidate cache.
  std::mutex range_del_mutex_;
  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
      cached_range_tombstone_;

  void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
                           const Slice& key, const Slice& value, ValueType type,
                           SequenceNumber s, char* checksum_ptr);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace ROCKSDB_NAMESPACE
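For reference, the following is a minimal end-to-end sketch of driving this internal class, modeled on the construction pattern used in RocksDB's own memtable tests. It is not part of the commit above: the `Options`-derived arguments are simply the least configuration that satisfies the constructor, and error handling is reduced to asserts.

    #include <cassert>
    #include <string>

    #include "db/memtable.h"
    #include "db/merge_context.h"
    #include "rocksdb/comparator.h"
    #include "rocksdb/options.h"
    #include "rocksdb/write_buffer_manager.h"

    using namespace ROCKSDB_NAMESPACE;

    int main() {
      Options options;
      InternalKeyComparator cmp(BytewiseComparator());
      ImmutableOptions ioptions(options);
      WriteBufferManager wb(options.db_write_buffer_size);

      // The refcount starts at zero; the first owner must call Ref().
      MemTable* mem =
          new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
                       kMaxSequenceNumber, 0 /* column_family_id */);
      mem->Ref();

      // Insert one key at sequence number 1, then read it back.
      Status s = mem->Add(1 /* seq */, kTypeValue, "k", "v",
                          nullptr /* kv_prot_info */);
      assert(s.ok());

      LookupKey lkey("k", 1 /* seq */);
      std::string value;
      MergeContext merge_context;
      SequenceNumber max_covering_tombstone_seq = 0;
      Status status;
      bool found = mem->Get(lkey, &value, nullptr /* columns */,
                            nullptr /* timestamp */, &status, &merge_context,
                            &max_covering_tombstone_seq, ReadOptions(),
                            false /* immutable_memtable */);
      assert(found && status.ok() && value == "v");

      // Unref() hands back the memtable once the last reference is dropped.
      delete mem->Unref();
      return 0;
    }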