Diffstat (limited to 'src/rocksdb/db/memtable_list.cc')
-rw-r--r-- | src/rocksdb/db/memtable_list.cc | 991
1 file changed, 991 insertions(+), 0 deletions(-)
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..1545003ad
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,991 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "db/memtable_list.h"

#include <algorithm>
#include <cinttypes>
#include <limits>
#include <queue>
#include <string>

#include "db/db_impl/db_impl.h"
#include "db/memtable.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_set.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "monitoring/thread_status_util.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "table/merging_iterator.h"
#include "test_util/sync_point.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

class InternalKeyComparator;
class Mutex;
class VersionSet;

void MemTableListVersion::AddMemTable(MemTable* m) {
  memlist_.push_front(m);
  *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
}

void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
                                        MemTable* m) {
  if (m->Unref()) {
    to_delete->push_back(m);
    assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
    *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
  }
}

MemTableListVersion::MemTableListVersion(
    size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
    : max_write_buffer_number_to_maintain_(
          old.max_write_buffer_number_to_maintain_),
      max_write_buffer_size_to_maintain_(
          old.max_write_buffer_size_to_maintain_),
      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
  memlist_ = old.memlist_;
  for (auto& m : memlist_) {
    m->Ref();
  }

  memlist_history_ = old.memlist_history_;
  for (auto& m : memlist_history_) {
    m->Ref();
  }
}

MemTableListVersion::MemTableListVersion(
    size_t* parent_memtable_list_memory_usage,
    int max_write_buffer_number_to_maintain,
    int64_t max_write_buffer_size_to_maintain)
    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
      max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}

void MemTableListVersion::Ref() { ++refs_; }

// called by superversion::clean()
void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
  assert(refs_ >= 1);
  --refs_;
  if (refs_ == 0) {
    // if to_delete is equal to nullptr it means we're confident
    // that refs_ will not be zero
    assert(to_delete != nullptr);
    for (const auto& m : memlist_) {
      UnrefMemTable(to_delete, m);
    }
    for (const auto& m : memlist_history_) {
      UnrefMemTable(to_delete, m);
    }
    delete this;
  }
}

int MemTableList::NumNotFlushed() const {
  int size = static_cast<int>(current_->memlist_.size());
  assert(num_flush_not_started_ <= size);
  return size;
}

int MemTableList::NumFlushed() const {
  return static_cast<int>(current_->memlist_history_.size());
}
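// A minimal sketch (hypothetical caller, disabled code, all names below are
// assumptions) of the deferred-deletion pattern used throughout this file:
// Unref() only *collects* memtables whose refcount dropped to zero into
// `to_delete`; the caller frees them after releasing the DB mutex, so
// destruction never happens under the lock.
#if 0
autovector<MemTable*> to_delete;
{
  InstrumentedMutexLock guard(&db_mutex);  // hypothetical mutex
  list_version->Unref(&to_delete);         // collect under the lock
}
for (MemTable* m : to_delete) {
  delete m;  // free outside the lock
}
#endif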
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
// Operands store the list of merge operations to apply so far.
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
                              PinnableWideColumns* columns,
                              std::string* timestamp, Status* s,
                              MergeContext* merge_context,
                              SequenceNumber* max_covering_tombstone_seq,
                              SequenceNumber* seq, const ReadOptions& read_opts,
                              ReadCallback* callback, bool* is_blob_index) {
  return GetFromList(&memlist_, key, value, columns, timestamp, s,
                     merge_context, max_covering_tombstone_seq, seq, read_opts,
                     callback, is_blob_index);
}

void MemTableListVersion::MultiGet(const ReadOptions& read_options,
                                   MultiGetRange* range,
                                   ReadCallback* callback) {
  for (auto memtable : memlist_) {
    memtable->MultiGet(read_options, range, callback,
                       true /* immutable_memtable */);
    if (range->empty()) {
      return;
    }
  }
}

bool MemTableListVersion::GetMergeOperands(
    const LookupKey& key, Status* s, MergeContext* merge_context,
    SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
  for (MemTable* memtable : memlist_) {
    bool done = memtable->Get(
        key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s,
        merge_context, max_covering_tombstone_seq, read_opts,
        true /* immutable_memtable */, nullptr, nullptr, false);
    if (done) {
      return true;
    }
  }
  return false;
}

bool MemTableListVersion::GetFromHistory(
    const LookupKey& key, std::string* value, PinnableWideColumns* columns,
    std::string* timestamp, Status* s, MergeContext* merge_context,
    SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
    const ReadOptions& read_opts, bool* is_blob_index) {
  return GetFromList(&memlist_history_, key, value, columns, timestamp, s,
                     merge_context, max_covering_tombstone_seq, seq, read_opts,
                     nullptr /*read_callback*/, is_blob_index);
}

bool MemTableListVersion::GetFromList(
    std::list<MemTable*>* list, const LookupKey& key, std::string* value,
    PinnableWideColumns* columns, std::string* timestamp, Status* s,
    MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
    SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback,
    bool* is_blob_index) {
  *seq = kMaxSequenceNumber;

  for (auto& memtable : *list) {
    assert(memtable->IsFragmentedRangeTombstonesConstructed());
    SequenceNumber current_seq = kMaxSequenceNumber;

    bool done =
        memtable->Get(key, value, columns, timestamp, s, merge_context,
                      max_covering_tombstone_seq, &current_seq, read_opts,
                      true /* immutable_memtable */, callback, is_blob_index);
    if (*seq == kMaxSequenceNumber) {
      // Store the most recent sequence number of any operation on this key.
      // Since we only care about the most recent change, we only need to
      // return the first operation found when searching memtables in
      // reverse-chronological order.
      // current_seq would be equal to kMaxSequenceNumber if the value was to
      // be skipped. This allows seq to be assigned again when the next value
      // is read.
      *seq = current_seq;
    }

    if (done) {
      assert(*seq != kMaxSequenceNumber || s->IsNotFound());
      return true;
    }
    if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
      return false;
    }
  }
  return false;
}
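// Illustration (hypothetical values, disabled code): because GetFromList()
// above walks memtables newest to oldest and returns on the first match, a
// key overwritten across memtable generations yields its latest visible
// value. With Put(k, "v1") in an older immutable memtable and Put(k, "v2")
// in a newer one, a lookup sketch might be:
#if 0
LookupKey lkey("k", kMaxSequenceNumber);  // read at the latest snapshot
std::string value;
Status s;
MergeContext merge_context;
SequenceNumber max_covering_tombstone_seq = 0;
SequenceNumber seq;
bool found = version->Get(lkey, &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, &seq, ReadOptions(),
                          /*callback=*/nullptr, /*is_blob_index=*/nullptr);
// found == true, value == "v2", seq == seqno of the newer Put
#endif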
Status MemTableListVersion::AddRangeTombstoneIterators(
    const ReadOptions& read_opts, Arena* /*arena*/,
    RangeDelAggregator* range_del_agg) {
  assert(range_del_agg != nullptr);
  // Except for snapshot read, using kMaxSequenceNumber is OK because these
  // are immutable memtables.
  SequenceNumber read_seq = read_opts.snapshot != nullptr
                                ? read_opts.snapshot->GetSequenceNumber()
                                : kMaxSequenceNumber;
  for (auto& m : memlist_) {
    assert(m->IsFragmentedRangeTombstonesConstructed());
    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
        m->NewRangeTombstoneIterator(read_opts, read_seq,
                                     true /* immutable_memtable */));
    range_del_agg->AddTombstones(std::move(range_del_iter));
  }
  return Status::OK();
}

void MemTableListVersion::AddIterators(
    const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
    Arena* arena) {
  for (auto& m : memlist_) {
    iterator_list->push_back(m->NewIterator(options, arena));
  }
}

void MemTableListVersion::AddIterators(const ReadOptions& options,
                                       MergeIteratorBuilder* merge_iter_builder,
                                       bool add_range_tombstone_iter) {
  for (auto& m : memlist_) {
    auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
    if (!add_range_tombstone_iter || options.ignore_range_deletions) {
      merge_iter_builder->AddIterator(mem_iter);
    } else {
      // Except for snapshot read, using kMaxSequenceNumber is OK because
      // these are immutable memtables.
      SequenceNumber read_seq = options.snapshot != nullptr
                                    ? options.snapshot->GetSequenceNumber()
                                    : kMaxSequenceNumber;
      TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
      auto range_del_iter = m->NewRangeTombstoneIterator(
          options, read_seq, true /* immutable_memtable */);
      if (range_del_iter == nullptr || range_del_iter->empty()) {
        delete range_del_iter;
      } else {
        mem_tombstone_iter = new TruncatedRangeDelIterator(
            std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
            &m->GetInternalKeyComparator(), nullptr /* smallest */,
            nullptr /* largest */);
      }
      merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
                                                       mem_tombstone_iter);
    }
  }
}

uint64_t MemTableListVersion::GetTotalNumEntries() const {
  uint64_t total_num = 0;
  for (auto& m : memlist_) {
    total_num += m->num_entries();
  }
  return total_num;
}

MemTable::MemTableStats MemTableListVersion::ApproximateStats(
    const Slice& start_ikey, const Slice& end_ikey) {
  MemTable::MemTableStats total_stats = {0, 0};
  for (auto& m : memlist_) {
    auto mStats = m->ApproximateStats(start_ikey, end_ikey);
    total_stats.size += mStats.size;
    total_stats.count += mStats.count;
  }
  return total_stats;
}

uint64_t MemTableListVersion::GetTotalNumDeletes() const {
  uint64_t total_num = 0;
  for (auto& m : memlist_) {
    total_num += m->num_deletes();
  }
  return total_num;
}

SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
    bool include_history) const {
  if (include_history && !memlist_history_.empty()) {
    return memlist_history_.back()->GetEarliestSequenceNumber();
  } else if (!memlist_.empty()) {
    return memlist_.back()->GetEarliestSequenceNumber();
  } else {
    return kMaxSequenceNumber;
  }
}

SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const {
  SequenceNumber min_first_seqno = kMaxSequenceNumber;
  // The first memtable in the list might not be the oldest one with mempurge
  for (const auto& m : memlist_) {
    min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno);
  }
  return min_first_seqno;
}

// caller is responsible for referencing m
void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
  AddMemTable(m);
  // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast
  TrimHistory(to_delete, 0);
}
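// Worked example (hypothetical sequence numbers) for GetFirstSequenceNumber()
// above: a mempurge output memtable pushed to the front of memlist_ may carry
// re-written entries with seqnos [10, 20] while the memtable behind it holds
// [30, 40]. The minimum (10) then lives at the *front*, so no single list
// position is guaranteed to hold it and the whole list must be scanned for
// the min.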
// Removes m from the list of memtables not flushed. Caller should NOT Unref m.
void MemTableListVersion::Remove(MemTable* m,
                                 autovector<MemTable*>* to_delete) {
  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
  memlist_.remove(m);

  m->MarkFlushed();
  if (max_write_buffer_size_to_maintain_ > 0 ||
      max_write_buffer_number_to_maintain_ > 0) {
    memlist_history_.push_front(m);
    // Unable to get the size of the mutable memtable at this point, pass 0 to
    // TrimHistory as a best effort.
    TrimHistory(to_delete, 0);
  } else {
    UnrefMemTable(to_delete, m);
  }
}

// Returns the total memory usage assuming the oldest flushed memtable is
// dropped.
size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const {
  size_t total_memtable_size = 0;
  for (auto& memtable : memlist_) {
    total_memtable_size += memtable->MemoryAllocatedBytes();
  }
  for (auto& memtable : memlist_history_) {
    total_memtable_size += memtable->MemoryAllocatedBytes();
  }
  if (!memlist_history_.empty()) {
    total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes();
  }
  return total_memtable_size;
}

bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
  if (max_write_buffer_size_to_maintain_ > 0) {
    // Calculate the total memory usage after dropping the oldest flushed
    // memtable, and compare it with max_write_buffer_size_to_maintain_ to
    // decide whether to trim history.
    return MemoryAllocatedBytesExcludingLast() + usage >=
           static_cast<size_t>(max_write_buffer_size_to_maintain_);
  } else if (max_write_buffer_number_to_maintain_ > 0) {
    return memlist_.size() + memlist_history_.size() >
           static_cast<size_t>(max_write_buffer_number_to_maintain_);
  } else {
    return false;
  }
}

// Make sure we don't use up too much space in history
bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
                                      size_t usage) {
  bool ret = false;
  while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
    MemTable* x = memlist_history_.back();
    memlist_history_.pop_back();

    UnrefMemTable(to_delete, x);
    ret = true;
  }
  return ret;
}

// Returns true if there is at least one memtable on which flush has
// not yet started.
bool MemTableList::IsFlushPending() const {
  if ((flush_requested_ && num_flush_not_started_ > 0) ||
      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
    assert(imm_flush_needed.load(std::memory_order_relaxed));
    return true;
  }
  return false;
}

bool MemTableList::IsFlushPendingOrRunning() const {
  if (current_->memlist_.size() - num_flush_not_started_ > 0) {
    // Flush is already running on at least one memtable
    return true;
  }
  return IsFlushPending();
}
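// Worked example (hypothetical sizes) of the MemtableLimitExceeded() /
// TrimHistory() arithmetic above: with max_write_buffer_size_to_maintain_ =
// 64 MB, two unflushed memtables of 20 MB each, two history memtables of
// 20 MB each, and `usage` = 0, MemoryAllocatedBytesExcludingLast() is
// 20 + 20 + 20 + 20 - 20 = 60 MB (the oldest history memtable is
// discounted); 60 MB < 64 MB, so nothing is trimmed. If a third 20 MB
// history memtable is added, the figure becomes 100 - 20 = 80 MB >= 64 MB,
// and TrimHistory() pops history memtables from the back until the
// inequality no longer holds (here, a single pop brings it back to 60 MB).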
// Returns the memtables that need to be flushed.
void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
                                        autovector<MemTable*>* ret,
                                        uint64_t* max_next_log_number) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
  const auto& memlist = current_->memlist_;
  bool atomic_flush = false;

  // Note: every time MemTableList::Add(mem) is called, it adds the new mem
  // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by
  // iterating through the memlist starting at the end, the vector<MemTable*>
  // ret is filled with memtables already sorted in increasing MemTable ID.
  // However, when the mempurge feature is activated, new memtables with
  // older IDs will be added to the memlist.
  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
    MemTable* m = *it;
    if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
      atomic_flush = true;
    }
    if (m->GetID() > max_memtable_id) {
      break;
    }
    if (!m->flush_in_progress_) {
      assert(!m->flush_completed_);
      num_flush_not_started_--;
      if (num_flush_not_started_ == 0) {
        imm_flush_needed.store(false, std::memory_order_release);
      }
      m->flush_in_progress_ = true;  // flushing will start very soon
      if (max_next_log_number) {
        *max_next_log_number =
            std::max(m->GetNextLogNumber(), *max_next_log_number);
      }
      ret->push_back(m);
    } else if (!ret->empty()) {
      // This `break` is necessary to prevent picking non-consecutive
      // memtables in case `memlist` has one or more entries with
      // `flush_in_progress_ == true` sandwiched between entries with
      // `flush_in_progress_ == false`. This could happen after parallel
      // flushes are picked and the one flushing older memtables is rolled
      // back.
      break;
    }
  }
  if (!atomic_flush || num_flush_not_started_ == 0) {
    flush_requested_ = false;  // start-flush request is complete
  }
}

void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
                                         uint64_t /*file_number*/) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
  assert(!mems.empty());

  // If the flush was not successful, then just reset state.
  // Maybe a succeeding attempt to flush will be successful.
  for (MemTable* m : mems) {
    assert(m->flush_in_progress_);
    assert(m->file_number_ == 0);

    m->flush_in_progress_ = false;
    m->flush_completed_ = false;
    m->edit_.Clear();
    num_flush_not_started_++;
  }
  imm_flush_needed.store(true, std::memory_order_release);
}
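// A minimal sketch (hypothetical, heavily simplified flush-job driver;
// WriteMemtablesToL0 is an invented placeholder) of how the calls above and
// below compose into the flush lifecycle:
#if 0
autovector<MemTable*> mems;
imm->PickMemtablesToFlush(max_memtable_id, &mems, &max_next_log_number);
Status s = WriteMemtablesToL0(mems, &file_number);   // hypothetical L0 write
if (s.ok()) {
  s = imm->TryInstallMemtableFlushResults(/*...*/);  // commit to MANIFEST
} else {
  imm->RollbackMemtableFlush(mems, file_number);     // reset; retry later
}
#endif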
// Try to record a successful flush in the manifest file. It might just
// return Status::OK letting a concurrent flush do the actual recording.
Status MemTableList::TryInstallMemtableFlushResults(
    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
    const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
    VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
    LogBuffer* log_buffer,
    std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
    bool write_edits) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
  mu->AssertHeld();

  // Flush was successful.
  // Record the status on the memtable object. Either this call or a call by
  // a concurrent flush thread will read the status and write it to the
  // manifest.
  for (size_t i = 0; i < mems.size(); ++i) {
    // All the edits are associated with the first memtable of this batch.
    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);

    mems[i]->flush_completed_ = true;
    mems[i]->file_number_ = file_number;
  }

  // If some other thread is already committing, then return.
  Status s;
  if (commit_in_progress_) {
    TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
    return s;
  }

  // Only a single thread can be executing this piece of code.
  commit_in_progress_ = true;

  // Retry until all completed flushes are committed. New flushes can finish
  // while the current thread is writing the manifest, where the mutex is
  // released.
  while (s.ok()) {
    auto& memlist = current_->memlist_;
    // The back is the oldest; if flush_completed_ is not set on it, it means
    // that we were assigned a more recent memtable. The memtables' flushes
    // must be recorded in the manifest in order. A concurrent flush thread,
    // which is assigned to flush the oldest memtable, will later wake up and
    // do all the pending writes to the manifest, in order.
    if (memlist.empty() || !memlist.back()->flush_completed_) {
      break;
    }
    // Scan all memtables from the earliest, and commit those
    // (in that order) that have finished flushing. Memtables
    // are always committed in the order that they were created.
    uint64_t batch_file_number = 0;
    size_t batch_count = 0;
    autovector<VersionEdit*> edit_list;
    autovector<MemTable*> memtables_to_flush;
    // Enumerate from the last (earliest) element to see how many batches
    // have finished.
    for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
      MemTable* m = *it;
      if (!m->flush_completed_) {
        break;
      }
      if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
        batch_file_number = m->file_number_;
        if (m->edit_.GetBlobFileAdditions().empty()) {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64 " started",
                           cfd->GetName().c_str(), m->file_number_);
        } else {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64
                           " (+%zu blob files) started",
                           cfd->GetName().c_str(), m->file_number_,
                           m->edit_.GetBlobFileAdditions().size());
        }

        edit_list.push_back(&m->edit_);
        memtables_to_flush.push_back(m);
#ifndef ROCKSDB_LITE
        std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
        if (info != nullptr) {
          committed_flush_jobs_info->push_back(std::move(info));
        }
#else
        (void)committed_flush_jobs_info;
#endif  // !ROCKSDB_LITE
      }
      batch_count++;
    }

    // TODO(myabandeh): Not sure how batch_count could be 0 here.
    if (batch_count > 0) {
      uint64_t min_wal_number_to_keep = 0;
      assert(edit_list.size() > 0);
      if (vset->db_options()->allow_2pc) {
        // Note that if mempurge is successful, the edit_list will not be
        // applicable (it contains the info of the new min_log number to keep
        // and the level 0 file path of the SST file created during a normal
        // flush, so both pieces of information are irrelevant after a
        // successful mempurge operation).
        min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
            vset, *cfd, edit_list, memtables_to_flush, prep_tracker);

        // We piggyback the information of the earliest log file to keep in
        // the manifest entry for the last file flushed.
      } else {
        min_wal_number_to_keep =
            PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
      }

      VersionEdit wal_deletion;
      wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
      if (vset->db_options()->track_and_verify_wals_in_manifest) {
        if (min_wal_number_to_keep >
            vset->GetWalSet().GetMinWalNumberToKeep()) {
          wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
        }
        TEST_SYNC_POINT_CALLBACK(
            "MemTableList::TryInstallMemtableFlushResults:"
            "AfterComputeMinWalToKeep",
            nullptr);
      }
      edit_list.push_back(&wal_deletion);

      const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
                                      to_delete, mu](const Status& status) {
        RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer,
                                      to_delete, mu);
      };
      if (write_edits) {
        // This can release and reacquire the mutex.
        s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
                              db_directory, /*new_descriptor_log=*/false,
                              /*column_family_options=*/nullptr,
                              manifest_write_cb);
      } else {
        // If write_edits is false (e.g. after a successful mempurge), then
        // remove the old memtables, wake up the manifest write queue
        // threads, and don't commit anything to the manifest file.
        RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
                                      to_delete, mu);
        // Note: cfd->SetLogNumber is only called when a VersionEdit is
        // written to the MANIFEST. When mempurge is successful, we skip this
        // step, therefore cfd->GetLogNumber is always the earliest log with
        // unflushed data.
        // Notify the new head of the manifest write queue and wake up all
        // the waiting writers.
        // TODO(bjlemaire): explain the full reason why
        // WakeUpWaitingManifestWriters is needed, or investigate more.
        vset->WakeUpWaitingManifestWriters();
      }
    }
  }
  commit_in_progress_ = false;
  return s;
}
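// Worked example (hypothetical file numbers) of the batching above: if the
// oldest-to-newest scan sees completed memtables with file_number_ 7, 7, 7,
// 9, only the first memtable of each distinct file number contributes its
// VersionEdit (the others carry empty edits), so edit_list holds the edits
// for #7 and #9 while batch_count == 4. The manifest write then covers all
// four memtables at once, and RemoveMemTablesOrRestoreFlags() removes (or,
// on failure, restores) exactly batch_count entries from the back.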
// New memtables are inserted at the front of the list.
void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
  assert(static_cast<int>(current_->memlist_.size()) >=
         num_flush_not_started_);
  InstallNewVersion();
  // This method is used to move a mutable memtable into the immutable list.
  // Since the mutable memtable is already refcounted by the DBImpl, and we
  // don't unref it when moving it to the immutable list, we don't have to
  // ref the memtable here. We just take over the reference from the DBImpl.
  current_->Add(m, to_delete);
  m->MarkImmutable();
  num_flush_not_started_++;
  if (num_flush_not_started_ == 1) {
    imm_flush_needed.store(true, std::memory_order_release);
  }
  UpdateCachedValuesFromMemTableListVersion();
  ResetTrimHistoryNeeded();
}

bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete,
                               size_t usage) {
  InstallNewVersion();
  bool ret = current_->TrimHistory(to_delete, usage);
  UpdateCachedValuesFromMemTableListVersion();
  ResetTrimHistoryNeeded();
  return ret;
}
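// Illustration (hypothetical call sites, disabled code) of the reference
// hand-off described in Add() above: DBImpl allocates and refs the mutable
// memtable, and when it switches write buffers it passes that same reference
// to the immutable list rather than ref'ing again.
#if 0
MemTable* mutable_mem = new MemTable(/*...*/);
mutable_mem->Ref();                  // held by DBImpl while mutable
// ... memtable fills up, DBImpl switches write buffers ...
imm->Add(mutable_mem, &to_delete);   // list takes over DBImpl's reference
#endif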
// Returns an estimate of the number of bytes of data in use.
size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
  size_t total_size = 0;
  for (auto& memtable : current_->memlist_) {
    total_size += memtable->ApproximateMemoryUsage();
  }
  return total_size;
}

size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }

size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
  const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
      std::memory_order_relaxed);
  return usage;
}

bool MemTableList::HasHistory() const {
  const bool has_history = current_has_history_.load(std::memory_order_relaxed);
  return has_history;
}

void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
  const size_t total_memtable_size =
      current_->MemoryAllocatedBytesExcludingLast();
  current_memory_allocted_bytes_excluding_last_.store(
      total_memtable_size, std::memory_order_relaxed);

  const bool has_history = current_->HasHistory();
  current_has_history_.store(has_history, std::memory_order_relaxed);
}

uint64_t MemTableList::ApproximateOldestKeyTime() const {
  if (!current_->memlist_.empty()) {
    return current_->memlist_.back()->ApproximateOldestKeyTime();
  }
  return std::numeric_limits<uint64_t>::max();
}

void MemTableList::InstallNewVersion() {
  if (current_->refs_ == 1) {
    // we're the only one using the version, just keep using it
  } else {
    // somebody else holds the current version, we need to create a new one
    MemTableListVersion* version = current_;
    current_ = new MemTableListVersion(&current_memory_usage_, *version);
    current_->Ref();
    version->Unref();
  }
}
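// Illustration (hypothetical reader thread, disabled code) of the
// copy-on-write scheme in InstallNewVersion() above: readers pin the current
// version with Ref(), so a writer that needs to mutate the list while
// refs_ > 1 clones it, and the reader keeps iterating its old, immutable
// snapshot.
#if 0
// Reader:
MemTableListVersion* v = imm->current();  // assumed accessor
v->Ref();
// ... read from v ...
v->Unref(&to_delete);

// Writer: InstallNewVersion() sees refs_ == 2 and clones, leaving v intact.
#endif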
void MemTableList::RemoveMemTablesOrRestoreFlags(
    const Status& s, ColumnFamilyData* cfd, size_t batch_count,
    LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
    InstrumentedMutex* mu) {
  assert(mu);
  mu->AssertHeld();
  assert(to_delete);
  // We will be changing the version in the next code path,
  // so we'd better create a new one, since versions are immutable.
  InstallNewVersion();

  // All the later memtables that have the same filenum
  // are part of the same batch. They can be committed now.
  uint64_t mem_id = 1;  // how many memtables have been flushed

  // Commit the new state only if the column family is NOT dropped.
  // The reason is as follows (refer to
  // ColumnFamilyTest.FlushAndDropRaceCondition).
  // If the column family is dropped, then according to LogAndApply, its
  // corresponding flush operation is NOT written to the MANIFEST. This
  // means the DB is not aware of the L0 files generated from the flush.
  // By committing the new state, we remove the memtable from the memtable
  // list. Creating an iterator on this column family will not be able to
  // read full data since the memtable is removed, and the DB is not aware
  // of the L0 files, leaving the MergingIterator unable to build child
  // iterators. The RocksDB contract requires that an iterator can be created
  // on a dropped column family, and that we must be able to read full data
  // as long as the column family handle is not deleted, even if the column
  // family is dropped.
  if (s.ok() && !cfd->IsDropped()) {  // commit new state
    while (batch_count-- > 0) {
      MemTable* m = current_->memlist_.back();
      if (m->edit_.GetBlobFileAdditions().empty()) {
        ROCKS_LOG_BUFFER(log_buffer,
                         "[%s] Level-0 commit table #%" PRIu64
                         ": memtable #%" PRIu64 " done",
                         cfd->GetName().c_str(), m->file_number_, mem_id);
      } else {
        ROCKS_LOG_BUFFER(log_buffer,
                         "[%s] Level-0 commit table #%" PRIu64
                         " (+%zu blob files)"
                         ": memtable #%" PRIu64 " done",
                         cfd->GetName().c_str(), m->file_number_,
                         m->edit_.GetBlobFileAdditions().size(), mem_id);
      }

      assert(m->file_number_ > 0);
      current_->Remove(m, to_delete);
      UpdateCachedValuesFromMemTableListVersion();
      ResetTrimHistoryNeeded();
      ++mem_id;
    }
  } else {
    for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
      MemTable* m = *it;
      // Commit failed. Set up the state so that we can flush again.
      if (m->edit_.GetBlobFileAdditions().empty()) {
        ROCKS_LOG_BUFFER(log_buffer,
                         "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64
                         " failed",
                         m->file_number_, mem_id);
      } else {
        ROCKS_LOG_BUFFER(log_buffer,
                         "Level-0 commit table #%" PRIu64
                         " (+%zu blob files)"
                         ": memtable #%" PRIu64 " failed",
                         m->file_number_,
                         m->edit_.GetBlobFileAdditions().size(), mem_id);
      }

      m->flush_completed_ = false;
      m->flush_in_progress_ = false;
      m->edit_.Clear();
      num_flush_not_started_++;
      m->file_number_ = 0;
      imm_flush_needed.store(true, std::memory_order_release);
      ++mem_id;
    }
  }
}

uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
    const std::unordered_set<MemTable*>* memtables_to_flush) {
  uint64_t min_log = 0;

  for (auto& m : current_->memlist_) {
    if (memtables_to_flush && memtables_to_flush->count(m)) {
      continue;
    }

    auto log = m->GetMinLogContainingPrepSection();

    if (log > 0 && (min_log == 0 || log < min_log)) {
      min_log = log;
    }
  }

  return min_log;
}
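// Worked example (hypothetical counts) for the atomic-group marking in the
// function below: with three column families there are four version edits
// (one per CF, plus the WAL deletion appended to the last CF's edit list),
// so num_entries == 4 and MarkAtomicGroup(--num_entries) tags the edits with
// remaining-entry counts 3, 2, 1, 0. Recovery applies the group only once it
// has read the edit tagged 0, i.e. all of them.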
// Commit a successful atomic flush in the manifest file.
Status InstallMemtableAtomicFlushResults(
    const autovector<MemTableList*>* imm_lists,
    const autovector<ColumnFamilyData*>& cfds,
    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
    const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
    const autovector<FileMetaData*>& file_metas,
    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
        committed_flush_jobs_info,
    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
    LogBuffer* log_buffer) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
  mu->AssertHeld();

  size_t num = mems_list.size();
  assert(cfds.size() == num);
  if (imm_lists != nullptr) {
    assert(imm_lists->size() == num);
  }
  if (num == 0) {
    return Status::OK();
  }

  for (size_t k = 0; k != num; ++k) {
#ifndef NDEBUG
    const auto* imm =
        (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
    if (!mems_list[k]->empty()) {
      assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
    }
#endif
    assert(nullptr != file_metas[k]);
    for (size_t i = 0; i != mems_list[k]->size(); ++i) {
      assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
      (*mems_list[k])[i]->SetFlushCompleted(true);
      (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
    }
#ifndef ROCKSDB_LITE
    if (committed_flush_jobs_info[k]) {
      assert(!mems_list[k]->empty());
      assert((*mems_list[k])[0]);
      std::unique_ptr<FlushJobInfo> flush_job_info =
          (*mems_list[k])[0]->ReleaseFlushJobInfo();
      committed_flush_jobs_info[k]->push_back(std::move(flush_job_info));
    }
#else   // ROCKSDB_LITE
    (void)committed_flush_jobs_info;
#endif  // ROCKSDB_LITE
  }

  Status s;

  autovector<autovector<VersionEdit*>> edit_lists;
  uint32_t num_entries = 0;
  for (const auto mems : mems_list) {
    assert(mems != nullptr);
    autovector<VersionEdit*> edits;
    assert(!mems->empty());
    edits.emplace_back((*mems)[0]->GetEdits());
    ++num_entries;
    edit_lists.emplace_back(edits);
  }

  WalNumber min_wal_number_to_keep = 0;
  if (vset->db_options()->allow_2pc) {
    min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
        vset, cfds, edit_lists, mems_list, prep_tracker);
  } else {
    min_wal_number_to_keep =
        PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
  }

  VersionEdit wal_deletion;
  wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
  if (vset->db_options()->track_and_verify_wals_in_manifest &&
      min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
    wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
  }
  edit_lists.back().push_back(&wal_deletion);
  ++num_entries;

  // Mark the version edits as an atomic group if the number of version edits
  // exceeds 1.
  if (cfds.size() > 1) {
    for (size_t i = 0; i < edit_lists.size(); i++) {
      assert((edit_lists[i].size() == 1) ||
             ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1)));
      for (auto& e : edit_lists[i]) {
        e->MarkAtomicGroup(--num_entries);
      }
    }
    assert(0 == num_entries);
  }

  // This can release and reacquire the mutex.
  s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
                        db_directory);

  for (size_t k = 0; k != cfds.size(); ++k) {
    auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
    imm->InstallNewVersion();
  }

  if (s.ok() || s.IsColumnFamilyDropped()) {
    for (size_t i = 0; i != cfds.size(); ++i) {
      if (cfds[i]->IsDropped()) {
        continue;
      }
      auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
      for (auto m : *mems_list[i]) {
        assert(m->GetFileNumber() > 0);
        uint64_t mem_id = m->GetID();

        const VersionEdit* const edit = m->GetEdits();
        assert(edit);

        if (edit->GetBlobFileAdditions().empty()) {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64
                           ": memtable #%" PRIu64 " done",
                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
                           mem_id);
        } else {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64
                           " (+%zu blob files)"
                           ": memtable #%" PRIu64 " done",
                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
                           edit->GetBlobFileAdditions().size(), mem_id);
        }

        imm->current_->Remove(m, to_delete);
        imm->UpdateCachedValuesFromMemTableListVersion();
        imm->ResetTrimHistoryNeeded();
      }
    }
  } else {
    for (size_t i = 0; i != cfds.size(); ++i) {
      auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
      for (auto m : *mems_list[i]) {
        uint64_t mem_id = m->GetID();

        const VersionEdit* const edit = m->GetEdits();
        assert(edit);

        if (edit->GetBlobFileAdditions().empty()) {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64
                           ": memtable #%" PRIu64 " failed",
                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
                           mem_id);
        } else {
          ROCKS_LOG_BUFFER(log_buffer,
                           "[%s] Level-0 commit table #%" PRIu64
                           " (+%zu blob files)"
                           ": memtable #%" PRIu64 " failed",
                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
                           edit->GetBlobFileAdditions().size(), mem_id);
        }

        m->SetFlushCompleted(false);
        m->SetFlushInProgress(false);
        m->GetEdits()->Clear();
        m->SetFileNumber(0);
        imm->num_flush_not_started_++;
      }
      imm->imm_flush_needed.store(true, std::memory_order_release);
    }
  }

  return s;
}
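// Worked example (hypothetical log numbers) for RemoveOldMemTables() below:
// with immutable memtables (oldest to newest) whose GetNextLogNumber() is
// 12, 15, and 21, a call with log_number = 15 removes the first two, since
// all of their data lives in WALs numbered below their next log number,
// which is at or below 15; the newest memtable still depends on a newer WAL
// and stays in the list.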
void MemTableList::RemoveOldMemTables(uint64_t log_number,
                                      autovector<MemTable*>* to_delete) {
  assert(to_delete != nullptr);
  InstallNewVersion();
  auto& memlist = current_->memlist_;
  autovector<MemTable*> old_memtables;
  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
    MemTable* mem = *it;
    if (mem->GetNextLogNumber() > log_number) {
      break;
    }
    old_memtables.push_back(mem);
  }

  for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
    MemTable* mem = *it;
    current_->Remove(mem, to_delete);
    --num_flush_not_started_;
    if (0 == num_flush_not_started_) {
      imm_flush_needed.store(false, std::memory_order_release);
    }
  }

  UpdateCachedValuesFromMemTableListVersion();
  ResetTrimHistoryNeeded();
}

}  // namespace ROCKSDB_NAMESPACE