path: root/src/rocksdb/db/db_impl
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:45:59 +0000
commit     19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree       42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/rocksdb/db/db_impl
parent     Initial commit. (diff)
Adding upstream version 16.2.11+ds. (upstream/16.2.11+ds, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rocksdb/db/db_impl')
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.cc                   4550
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.h                    2107
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_compaction_flush.cc  3116
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_debug.cc              294
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_experimental.cc       151
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_files.cc              667
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_open.cc              1651
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.cc           221
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.h            137
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.cc          671
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.h           333
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_write.cc             1839
-rw-r--r--  src/rocksdb/db/db_impl/db_secondary_test.cc          869
13 files changed, 16606 insertions, 0 deletions
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..d7880fc1a
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,4550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/composite_env_wrapper.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "memtable/hash_linklist_rep.h"
+#include "memtable/hash_skiplist_rep.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "tools/sst_dump_tool_imp.h"
+#include "util/autovector.h"
+#include "util/build_version.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+ "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options) {
+ // Compressing memtable flushes might not help unless the sequential load
+ // optimization is used for leveled compaction. Otherwise the CPU and
+ // latency overhead is not offset by saving much space.
+ if (ioptions.compaction_style == kCompactionStyleUniversal) {
+ if (mutable_cf_options.compaction_options_universal
+ .compression_size_percent < 0) {
+ return mutable_cf_options.compression;
+ } else {
+ return kNoCompression;
+ }
+ } else if (!ioptions.compression_per_level.empty()) {
+ // For leveled compaction, compress when min_level_to_compress != 0.
+ return ioptions.compression_per_level[0];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
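+
+// Usage sketch (illustrative only; `options` is a hypothetical Options being
+// prepared for DB::Open): with leveled compaction and a non-empty
+// compression_per_level, the flush output above uses compression_per_level[0],
+// so compression can be deferred to deeper levels:
+//
+//   Options options;
+//   options.compaction_style = kCompactionStyleLevel;
+//   options.compression_per_level = {kNoCompression, kNoCompression,
+//                                    kSnappyCompression, kSnappyCompression,
+//                                    kSnappyCompression, kSnappyCompression,
+//                                    kSnappyCompression};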
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+ ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+ for (auto& compression : OptionsHelper::compression_type_string_map) {
+ if (compression.second != kNoCompression &&
+ compression.second != kDisableCompressionOption) {
+ ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+ CompressionTypeSupported(compression.second));
+ }
+ }
+ ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+ crc32c::IsFastCrc32Supported().c_str());
+}
+} // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch, const bool batch_per_txn)
+ : dbname_(dbname),
+ own_info_log_(options.info_log == nullptr),
+ initial_db_options_(SanitizeOptions(dbname, options)),
+ env_(initial_db_options_.env),
+ fs_(initial_db_options_.file_system),
+ immutable_db_options_(initial_db_options_),
+ mutable_db_options_(initial_db_options_),
+ stats_(immutable_db_options_.statistics.get()),
+ mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS,
+ immutable_db_options_.use_adaptive_mutex),
+ default_cf_handle_(nullptr),
+ max_total_in_memory_state_(0),
+ file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+ file_options_, immutable_db_options_)),
+ seq_per_batch_(seq_per_batch),
+ batch_per_txn_(batch_per_txn),
+ db_lock_(nullptr),
+ shutting_down_(false),
+ manual_compaction_paused_(false),
+ bg_cv_(&mutex_),
+ logfile_number_(0),
+ log_dir_synced_(false),
+ log_empty_(true),
+ persist_stats_cf_handle_(nullptr),
+ log_sync_cv_(&mutex_),
+ total_log_size_(0),
+ is_snapshot_supported_(true),
+ write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+ write_thread_(immutable_db_options_),
+ nonmem_write_thread_(immutable_db_options_),
+ write_controller_(mutable_db_options_.delayed_write_rate),
+ last_batch_group_size_(0),
+ unscheduled_flushes_(0),
+ unscheduled_compactions_(0),
+ bg_bottom_compaction_scheduled_(0),
+ bg_compaction_scheduled_(0),
+ num_running_compactions_(0),
+ bg_flush_scheduled_(0),
+ num_running_flushes_(0),
+ bg_purge_scheduled_(0),
+ disable_delete_obsolete_files_(0),
+ pending_purge_obsolete_files_(0),
+ delete_obsolete_files_last_run_(env_->NowMicros()),
+ last_stats_dump_time_microsec_(0),
+ next_job_id_(1),
+ has_unpersisted_data_(false),
+ unable_to_release_oldest_log_(false),
+ num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+ wal_manager_(immutable_db_options_, file_options_, seq_per_batch),
+#endif // ROCKSDB_LITE
+ event_logger_(immutable_db_options_.info_log.get()),
+ bg_work_paused_(0),
+ bg_compaction_paused_(0),
+ refitting_level_(false),
+ opened_successfully_(false),
+ two_write_queues_(options.two_write_queues),
+ manual_wal_flush_(options.manual_wal_flush),
+ // last_sequence_ is always maintained by the main queue that also writes
+ // to the memtable. When two_write_queues_ is disabled, the last seq in
+ // the memtable is the same as the last seq published to the readers. When
+ // it is enabled but seq_per_batch_ is disabled, the last seq in the
+ // memtable still indicates the last published seq, since wal-only writes
+ // that go to the 2nd queue do not consume a sequence number. Otherwise,
+ // writes performed by the 2nd queue could change what is visible to the
+ // readers. In that case (last_seq_same_as_publish_seq_==false), the 2nd
+ // queue maintains a separate variable to indicate the last published
+ // sequence.
+ last_seq_same_as_publish_seq_(
+ !(seq_per_batch && options.two_write_queues)),
+ // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+ // requires a custom gc for compaction, we use that to set use_custom_gc_
+ // as well.
+ use_custom_gc_(seq_per_batch),
+ shutdown_initiated_(false),
+ own_sfm_(options.sst_file_manager == nullptr),
+ preserve_deletes_(options.preserve_deletes),
+ closed_(false),
+ error_handler_(this, immutable_db_options_, &mutex_),
+ atomic_flush_install_cv_(&mutex_) {
+ // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+ // WriteUnprepared, which should use seq_per_batch_.
+ assert(batch_per_txn_ || seq_per_batch_);
+ env_->GetAbsolutePath(dbname, &db_absolute_path_);
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+ // Give a large number for setting of "infinite" open files.
+ const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+ ? TableCache::kInfiniteCapacity
+ : mutable_db_options_.max_open_files - 10;
+ LRUCacheOptions co;
+ co.capacity = table_cache_size;
+ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_cache_ = NewLRUCache(co);
+
+ versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_,
+ &write_controller_, &block_cache_tracer_));
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+ DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+ DumpDBFileSummary(immutable_db_options_, dbname_);
+ immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ DumpSupportInfo(immutable_db_options_.info_log.get());
+
+ // Always open the DB with 0 here, which means if preserve_deletes_==true
+ // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
+ // is called by the client and this seqnum is advanced.
+ preserve_deletes_seqnum_.store(0);
+}
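+
+// Sizing sketch (illustrative only): with max_open_files == 1000, the table
+// cache above gets a capacity of 1000 - 10 = 990 entries (sharded by
+// table_cache_numshardbits), while max_open_files == -1 maps to
+// TableCache::kInfiniteCapacity.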
+
+Status DBImpl::Resume() {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+ InstrumentedMutexLock db_mutex(&mutex_);
+
+ if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+ // Nothing to do
+ return Status::OK();
+ }
+
+ if (error_handler_.IsRecoveryInProgress()) {
+ // Don't allow a mix of manual and automatic recovery
+ return Status::Busy();
+ }
+
+ mutex_.Unlock();
+ Status s = error_handler_.RecoverFromBGError(true);
+ mutex_.Lock();
+ return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, so
+// they don't inadvertently cause an error that makes us think recovery failed
+// 2. Flush memtables if there's any data, for all the CFs. This may result in
+// another error, which will be saved by error_handler_ and reported later
+// as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+// flush in the prior step might have been a no-op for some CFs, which
+// means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl() {
+ mutex_.AssertHeld();
+ WaitForBackgroundWork();
+
+ Status bg_error = error_handler_.GetBGError();
+ Status s;
+ if (shutdown_initiated_) {
+ // Returning shutdown status to SFM during auto recovery will cause it
+ // to abort the recovery and allow the shutdown to progress
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok() && bg_error.severity() > Status::Severity::kHardError) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but failed due to Fatal/Unrecoverable error");
+ s = bg_error;
+ }
+
+ // We cannot guarantee consistency of the WAL. So force flush Memtables of
+ // all the column families
+ if (s.ok()) {
+ FlushOptions flush_opts;
+ // We allow flush to stall write since we are trying to resume from error.
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->Ref();
+ mutex_.Unlock();
+ s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DB resume requested but failed due to Flush failure [%s]",
+ s.ToString().c_str());
+ }
+ }
+
+ JobContext job_context(0);
+ FindObsoleteFiles(&job_context, true);
+ if (s.ok()) {
+ s = error_handler_.ClearBGError();
+ }
+ mutex_.Unlock();
+
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+ }
+ mutex_.Lock();
+ // Check for shutdown again before scheduling further compactions,
+ // since we released and re-acquired the lock above
+ if (shutdown_initiated_) {
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok()) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ SchedulePendingCompaction(cfd);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ // Wake up any waiters - in this case, it could be the shutdown thread
+ bg_cv_.SignalAll();
+
+ // No need to check BGError again. If something happened, the event listener
+ // would be notified and the operation causing it would have failed
+ return s;
+}
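+
+// Usage sketch (illustrative only; `db` is a hypothetical, already-open DB*
+// whose background error, e.g. a full disk during flush, has since been
+// addressed):
+//
+//   Status s = db->Resume();
+//   if (s.ok()) {
+//     // background work is re-scheduled and writes may proceed again
+//   } else if (s.IsBusy()) {
+//     // automatic recovery is already in progress
+//   }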
+
+void DBImpl::WaitForBackgroundWork() {
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_) {
+ bg_cv_.Wait();
+ }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Shutdown: canceling all background work");
+
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->cancel();
+ thread_dump_stats_.reset();
+ }
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->cancel();
+ thread_persist_stats_.reset();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ if (!shutting_down_.load(std::memory_order_acquire) &&
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
+ !mutable_db_options_.avoid_flush_during_shutdown) {
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+ cfd->Ref();
+ mutex_.Unlock();
+ FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+ mutex_.Lock();
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ }
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+ }
+
+ shutting_down_.store(true, std::memory_order_release);
+ bg_cv_.SignalAll();
+ if (!wait) {
+ return;
+ }
+ WaitForBackgroundWork();
+}
+
+Status DBImpl::CloseHelper() {
+ // Guarantee that there is no background error recovery in progress before
+ // continuing with the shutdown
+ mutex_.Lock();
+ shutdown_initiated_ = true;
+ error_handler_.CancelErrorRecovery();
+ while (error_handler_.IsRecoveryInProgress()) {
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ // CancelAllBackgroundWork called with false means we just set the shutdown
+ // marker. After this we do a variant of the waiting and unschedule work
+ // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+ CancelAllBackgroundWork(false);
+ int bottom_compactions_unscheduled =
+ env_->UnSchedule(this, Env::Priority::BOTTOM);
+ int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
+ int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
+ Status ret;
+ mutex_.Lock();
+ bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
+ bg_compaction_scheduled_ -= compactions_unscheduled;
+ bg_flush_scheduled_ -= flushes_unscheduled;
+
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ || bg_purge_scheduled_ ||
+ pending_purge_obsolete_files_ ||
+ error_handler_.IsRecoveryInProgress()) {
+ TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+ bg_cv_.Wait();
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+ &files_grabbed_for_purge_);
+ EraseThreadStatusDbInfo();
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+
+ while (!flush_queue_.empty()) {
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ for (const auto& iter : flush_req) {
+ iter.first->UnrefAndTryDelete();
+ }
+ }
+ while (!compaction_queue_.empty()) {
+ auto cfd = PopFirstFromCompactionQueue();
+ cfd->UnrefAndTryDelete();
+ }
+
+ if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+ // we need to delete handle outside of lock because it does its own locking
+ mutex_.Unlock();
+ if (default_cf_handle_) {
+ delete default_cf_handle_;
+ default_cf_handle_ = nullptr;
+ }
+ if (persist_stats_cf_handle_) {
+ delete persist_stats_cf_handle_;
+ persist_stats_cf_handle_ = nullptr;
+ }
+ mutex_.Lock();
+ }
+
+ // Clean up obsolete files due to SuperVersion release.
+ // (1) Need to delete obsolete files before closing because RepairDB()
+ // scans all existing files in the file system and builds manifest file.
+ // Keeping obsolete files confuses the repair process.
+ // (2) Need to check if we Open()/Recover() the DB successfully before
+ // deleting because if VersionSet recover fails (may be due to corrupted
+ // manifest file), it is not able to identify live files correctly. As a
+ // result, all "live" files can get deleted by accident. However, corrupted
+ // manifest is recoverable by RepairDB().
+ if (opened_successfully_) {
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ // manifest number starting from 2
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ for (auto l : logs_to_free_) {
+ delete l;
+ }
+ for (auto& log : logs_) {
+ uint64_t log_number = log.writer->get_log_number();
+ Status s = log.ClearWriter();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to Sync WAL file %s with error -- %s",
+ LogFileName(immutable_db_options_.wal_dir, log_number).c_str(),
+ s.ToString().c_str());
+ // Retain the first error
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ logs_.clear();
+
+ // Table cache may have table handles holding blocks from the block cache.
+ // We need to release them before the block cache is destroyed. The block
+ // cache may be destroyed inside versions_.reset(), when column family data
+ // list is destroyed, so leaving handles in table cache after
+ // versions_.reset() may cause issues.
+ // Here we clean all unreferenced handles in table cache.
+ // Now we assume all user queries have finished, so only the version set
+ // itself can possibly hold blocks from the block cache. After releasing the
+ // unreferenced handles here, the only remaining handles are those held by
+ // the version set, and they are released inside versions_.reset(). There we
+ // make sure that every time a handle is released it is also erased from the
+ // cache, which guarantees that after versions_.reset() the table cache is
+ // empty and the block cache can be safely destroyed.
+ table_cache_->EraseUnRefEntries();
+
+ for (auto& txn_entry : recovered_transactions_) {
+ delete txn_entry.second;
+ }
+
+ // versions need to be destroyed before table_cache since it can hold
+ // references to table_cache.
+ versions_.reset();
+ mutex_.Unlock();
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+ LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+ // If the sst_file_manager was allocated by us during DB::Open(), call
+ // Close() on it before closing the info_log. Otherwise, a background thread
+ // in SstFileManagerImpl might try to log something
+ if (immutable_db_options_.sst_file_manager && own_sfm_) {
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ sfm->Close();
+ }
+#endif // ROCKSDB_LITE
+
+ if (immutable_db_options_.info_log && own_info_log_) {
+ Status s = immutable_db_options_.info_log->Close();
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+
+ if (ret.IsAborted()) {
+ // Reserve IsAborted() for cases where the user didn't release certain
+ // resources; they can release them, come back, and retry. In this case,
+ // we wrap the error into something else.
+ return Status::Incomplete(ret.ToString());
+ }
+ return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+ if (!closed_) {
+ closed_ = true;
+ CloseHelper();
+ }
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || immutable_db_options_.paranoid_checks) {
+ // No change needed
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+ s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+ if (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0) {
+ std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
+ return env_->CreateDirIfMissing(archivalPath);
+ }
+ return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+ auto dbstats = immutable_db_options_.statistics.get();
+ if (dbstats) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+ dbstats->ToString().c_str());
+ }
+}
+
+void DBImpl::StartTimedTasks() {
+ unsigned int stats_dump_period_sec = 0;
+ unsigned int stats_persist_period_sec = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec;
+ if (stats_dump_period_sec > 0) {
+ if (!thread_dump_stats_) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond));
+ }
+ }
+ stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec;
+ if (stats_persist_period_sec > 0) {
+ if (!thread_persist_stats_) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond));
+ }
+ }
+ }
+}
+
+// Estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+ size_t size_total =
+ sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+ if (stats_history_.size() == 0) return size_total;
+ size_t size_per_slice =
+ sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+ // non-empty map, stats_history_.begin() guaranteed to exist
+ std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second);
+ for (const auto& pairs : sample_slice) {
+ size_per_slice +=
+ pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+ }
+ size_total = size_per_slice * stats_history_.size();
+ return size_total;
+}
+
+void DBImpl::PersistStats() {
+ TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+ if (shutdown_initiated_) {
+ return;
+ }
+ uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond;
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return;
+ }
+ size_t stats_history_size_limit = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+ }
+
+ std::map<std::string, uint64_t> stats_map;
+ if (!statistics->getTickerMap(&stats_map)) {
+ return;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- PERSISTING STATS -------");
+
+ if (immutable_db_options_.persist_stats_to_disk) {
+ WriteBatch batch;
+ if (stats_slice_initialized_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+ stats_slice_.size());
+ for (const auto& stat : stats_map) {
+ char key[100];
+ int length =
+ EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+ // calculate the delta from last time
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ uint64_t delta = stat.second - stats_slice_[stat.first];
+ batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)),
+ ToString(delta));
+ }
+ }
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ Status s = Write(wo, &batch);
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing to persistent stats CF failed -- %s",
+ s.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to persistent stats CF succeeded",
+ stats_slice_.size(), now_seconds);
+ }
+ // TODO(Zhongyi): add purging for persisted data
+ } else {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ // calculate the delta from last time
+ if (stats_slice_initialized_) {
+ std::map<std::string, uint64_t> stats_delta;
+ for (const auto& stat : stats_map) {
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
+ stats_history_[now_seconds] = stats_delta;
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+ // delete older stats snapshots to control memory consumption
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ while (purge_needed && !stats_history_.empty()) {
+ stats_history_.erase(stats_history_.begin());
+ purge_needed =
+ EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ }
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map) {
+ assert(new_time);
+ assert(stats_map);
+ if (!new_time || !stats_map) return false;
+ // lock while searching for start_time
+ {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ auto it = stats_history_.lower_bound(start_time);
+ if (it != stats_history_.end() && it->first < end_time) {
+ // make a copy for timestamp and stats_map
+ *new_time = it->first;
+ *stats_map = it->second;
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
+Status DBImpl::GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+ if (!stats_iterator) {
+ return Status::InvalidArgument("stats_iterator not preallocated.");
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ stats_iterator->reset(
+ new PersistentStatsHistoryIterator(start_time, end_time, this));
+ } else {
+ stats_iterator->reset(
+ new InMemoryStatsHistoryIterator(start_time, end_time, this));
+ }
+ return (*stats_iterator)->status();
+}
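+
+// Usage sketch (illustrative only; assumes a hypothetical `db` opened with
+// options.statistics set and a non-zero stats_persist_period_sec):
+//
+//   std::unique_ptr<StatsHistoryIterator> it;
+//   Status s = db->GetStatsHistory(
+//       0 /* start_time */, std::numeric_limits<uint64_t>::max() /* end_time */,
+//       &it);
+//   for (; s.ok() && it->Valid(); it->Next()) {
+//     uint64_t when = it->GetStatsTime();
+//     const std::map<std::string, uint64_t>& snapshot = it->GetStatsMap();
+//     // inspect the ticker deltas recorded at `when` ...
+//   }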
+
+void DBImpl::DumpStats() {
+ TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+ const DBPropertyInfo* cf_property_info =
+ GetPropertyInfo(DB::Properties::kCFStats);
+ assert(cf_property_info != nullptr);
+ const DBPropertyInfo* db_property_info =
+ GetPropertyInfo(DB::Properties::kDBStats);
+ assert(db_property_info != nullptr);
+
+ std::string stats;
+ if (shutdown_initiated_) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ default_cf_internal_stats_->GetStringProperty(
+ *db_property_info, DB::Properties::kDBStats, &stats);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats);
+ }
+ }
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(
+ *cf_property_info, DB::Properties::kCFFileHistogram, &stats);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::DumpStats:2");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- DUMPING STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ if (immutable_db_options_.dump_malloc_stats) {
+ stats.clear();
+ DumpMallocStats(&stats);
+ if (!stats.empty()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- Malloc STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ PrintStatistics();
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str) {
+ auto* cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl, ColumnFamilyHandle>(
+ column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* version = super_version->current;
+
+ Status s =
+ version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+ CleanupSuperVersion(super_version);
+ return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+ if (!job_context->logs_to_free.empty()) {
+ for (auto l : job_context->logs_to_free) {
+ AddToLogsToFreeQueue(l);
+ }
+ job_context->logs_to_free.clear();
+ }
+}
+
+Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+ assert(cfd);
+ Directory* ret_dir = cfd->GetDataDir(path_id);
+ if (ret_dir == nullptr) {
+ return directories_.GetDataDir(path_id);
+ }
+ return ret_dir;
+}
+
+Status DBImpl::SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)column_family;
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], empty input",
+ cfd->GetName().c_str());
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableCFOptions new_options;
+ Status s;
+ Status persist_options_status;
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ auto db_options = GetDBOptions();
+ InstrumentedMutexLock l(&mutex_);
+ s = cfd->SetOptions(db_options, options_map);
+ if (s.ok()) {
+ new_options = *cfd->GetLatestMutableCFOptions();
+ // Append new version to recompute compaction score.
+ VersionEdit dummy_edit;
+ versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ // Trigger possible flush/compactions. This has to be before we persist
+ // options to file, otherwise there will be a deadlock with writer
+ // thread.
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ bg_cv_.SignalAll();
+ }
+ }
+ sv_context.Clean();
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+ cfd->GetName().c_str());
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
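+
+// Usage sketch (illustrative only; `db` and `cfh` are a hypothetical open DB*
+// and column family handle): mutable column family options can be changed at
+// runtime and are re-persisted to the OPTIONS file as shown above.
+//
+//   Status s = db->SetOptions(
+//       cfh, {{"write_buffer_size", "131072"},
+//             {"level0_file_num_compaction_trigger", "4"}});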
+
+Status DBImpl::SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetDBOptions(), empty input.");
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableDBOptions new_options;
+ Status s;
+ Status persist_options_status;
+ bool wal_changed = false;
+ WriteContext write_context;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+ &new_options);
+ if (new_options.bytes_per_sync == 0) {
+ new_options.bytes_per_sync = 1024 * 1024;
+ }
+ DBOptions new_db_options =
+ BuildDBOptions(immutable_db_options_, new_options);
+ if (s.ok()) {
+ s = ValidateOptions(new_db_options);
+ }
+ if (s.ok()) {
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped()) {
+ auto cf_options = c->GetLatestCFOptions();
+ s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ const BGJobLimits current_bg_job_limits =
+ GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ /* parallelize_compactions */ true);
+ const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+ immutable_db_options_.max_background_flushes,
+ new_options.max_background_compactions,
+ new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+ const bool max_flushes_increased =
+ new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+ const bool max_compactions_increased =
+ new_bg_job_limits.max_compactions >
+ current_bg_job_limits.max_compactions;
+
+ if (max_flushes_increased || max_compactions_increased) {
+ if (max_flushes_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+ }
+
+ if (max_compactions_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ }
+
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (new_options.stats_dump_period_sec !=
+ mutable_db_options_.stats_dump_period_sec) {
+ if (thread_dump_stats_) {
+ mutex_.Unlock();
+ thread_dump_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_dump_period_sec > 0) {
+ thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::DumpStats(); }, "dump_st", env_,
+ static_cast<uint64_t>(new_options.stats_dump_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_dump_stats_.reset();
+ }
+ }
+ if (new_options.stats_persist_period_sec !=
+ mutable_db_options_.stats_persist_period_sec) {
+ if (thread_persist_stats_) {
+ mutex_.Unlock();
+ thread_persist_stats_->cancel();
+ mutex_.Lock();
+ }
+ if (new_options.stats_persist_period_sec > 0) {
+ thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread(
+ [this]() { DBImpl::PersistStats(); }, "pst_st", env_,
+ static_cast<uint64_t>(new_options.stats_persist_period_sec) *
+ kMicrosInSecond));
+ } else {
+ thread_persist_stats_.reset();
+ }
+ }
+ write_controller_.set_max_delayed_write_rate(
+ new_options.delayed_write_rate);
+ table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+ ? TableCache::kInfiniteCapacity
+ : new_options.max_open_files - 10);
+ wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+ new_options.wal_bytes_per_sync;
+ mutable_db_options_ = new_options;
+ file_options_for_compaction_ = FileOptions(new_db_options);
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+ file_options_for_compaction_, immutable_db_options_);
+ versions_->ChangeFileOptions(mutable_db_options_);
+ // TODO(xiez): clarify why the read optimization is applied to write options
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+ file_options_for_compaction_, immutable_db_options_);
+ file_options_for_compaction_.compaction_readahead_size =
+ mutable_db_options_.compaction_readahead_size;
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+ Status purge_wal_status = SwitchWAL(&write_context);
+ if (!purge_wal_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to purge WAL files in SetDBOptions() -- %s",
+ purge_wal_status.ToString().c_str());
+ }
+ }
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+ write_thread_.ExitUnbatched(&w);
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ if (immutable_db_options_.fail_if_options_file_error) {
+ s = Status::IOError(
+ "SetDBOptions() succeeded, but unable to persist options",
+ persist_options_status.ToString());
+ }
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to persist options in SetDBOptions() -- %s",
+ persist_options_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
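+
+// Usage sketch (illustrative only; `db` is a hypothetical open DB*): mutable
+// DB-wide options go through SetDBOptions(), which may also grow thread pools
+// and restart the stats threads as shown above.
+//
+//   Status s = db->SetDBOptions(
+//       {{"max_background_jobs", "8"}, {"stats_dump_period_sec", "600"}});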
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+ ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+ int level) {
+ mutex_.AssertHeld();
+ const auto* vstorage = cfd->current()->storage_info();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (vstorage->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+}
+
+Status DBImpl::FlushWAL(bool sync) {
+ if (manual_wal_flush_) {
+ Status s;
+ {
+ // We need to lock log_write_mutex_ since logs_ might change concurrently
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ s.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ WriteStatusCheck(s);
+ // whether sync or not, we should abort the rest of the function upon error
+ return s;
+ }
+ if (!sync) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+ return s;
+ }
+ }
+ if (!sync) {
+ return Status::OK();
+ }
+ // sync = true
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+ return SyncWAL();
+}
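+
+// Usage sketch (illustrative only; assumes a hypothetical `db` opened with
+// options.manual_wal_flush = true, so WAL writes stay in the writer's buffer
+// until the application flushes them):
+//
+//   db->Put(WriteOptions(), "key", "value");
+//   db->FlushWAL(false /* sync */);  // hand the buffer to the file system
+//   db->FlushWAL(true /* sync */);   // additionally fsync the WAL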
+
+Status DBImpl::SyncWAL() {
+ autovector<log::Writer*, 1> logs_to_sync;
+ bool need_log_dir_sync;
+ uint64_t current_log_number;
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(!logs_.empty());
+
+ // This SyncWAL() call only cares about logs up to this number.
+ current_log_number = logfile_number_;
+
+ while (logs_.front().number <= current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ // First check that logs are safe to sync in background.
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "SyncWAL() is not supported for this implementation of WAL file",
+ immutable_db_options_.allow_mmap_writes
+ ? "try setting Options::allow_mmap_writes to false"
+ : Slice());
+ }
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ need_log_dir_sync = !log_dir_synced_;
+ }
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ Status status;
+ for (log::Writer* log : logs_to_sync) {
+ status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ status = directories_.GetWalDir()->Fsync();
+ }
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ MarkLogsSynced(current_log_number, need_log_dir_sync, status);
+ }
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+ return status;
+}
+
+Status DBImpl::LockWAL() {
+ log_write_mutex_.Lock();
+ auto cur_log_writer = logs_.back().writer;
+ auto status = cur_log_writer->WriteBuffer();
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ status.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ WriteStatusCheck(status);
+ }
+ return status;
+}
+
+Status DBImpl::UnlockWAL() {
+ log_write_mutex_.Unlock();
+ return Status::OK();
+}
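+
+// Usage sketch (illustrative only; `db` is a hypothetical open DB*): a caller
+// that reads or copies the live WAL externally can bracket that work so the
+// WAL writer's buffer is flushed first and no new WAL writes interleave:
+//
+//   Status s = db->LockWAL();
+//   if (s.ok()) {
+//     // ... read / copy WAL files ...
+//     db->UnlockWAL();
+//   }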
+
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+ const Status& status) {
+ mutex_.AssertHeld();
+ if (synced_dir && logfile_number_ == up_to && status.ok()) {
+ log_dir_synced_ = true;
+ }
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+ auto& log = *it;
+ assert(log.getting_synced);
+ if (status.ok() && logs_.size() > 1) {
+ logs_to_free_.push_back(log.ReleaseWriter());
+ // To modify logs_ both mutex_ and log_write_mutex_ must be held
+ InstrumentedMutexLock l(&log_write_mutex_);
+ it = logs_.erase(it);
+ } else {
+ log.getting_synced = false;
+ ++it;
+ }
+ }
+ assert(!status.ok() || logs_.empty() || logs_[0].number > up_to ||
+ (logs_.size() == 1 && !logs_[0].getting_synced));
+ log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+ versions_->SetLastPublishedSequence(seq);
+}
+
+bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
+ if (seqnum > preserve_deletes_seqnum_.load()) {
+ preserve_deletes_seqnum_.store(seqnum);
+ return true;
+ } else {
+ return false;
+ }
+}
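+
+// Usage sketch (illustrative only; assumes a hypothetical `db` opened with
+// options.preserve_deletes = true): compactions keep deletion markers newer
+// than the preserved sequence number; advancing it lets older markers be
+// dropped.
+//
+//   SequenceNumber cutoff = db->GetLatestSequenceNumber();
+//   bool advanced = db->SetPreserveDeletesSequenceNumber(cutoff);
+//   // `advanced` is false if `cutoff` does not exceed the current value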
+
+InternalIterator* DBImpl::NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+
+ mutex_.Lock();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ mutex_.Unlock();
+ ReadOptions roptions;
+ return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg,
+ sequence);
+}
+
+void DBImpl::SchedulePurge() {
+ mutex_.AssertHeld();
+ assert(opened_successfully_);
+
+ // Purge operations are put into High priority queue
+ bg_purge_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+ mutex_.Lock();
+
+ while (!logs_to_free_queue_.empty()) {
+ assert(!logs_to_free_queue_.empty());
+ log::Writer* log_writer = *(logs_to_free_queue_.begin());
+ logs_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete log_writer;
+ mutex_.Lock();
+ }
+ while (!superversions_to_free_queue_.empty()) {
+ assert(!superversions_to_free_queue_.empty());
+ SuperVersion* sv = superversions_to_free_queue_.front();
+ superversions_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete sv;
+ mutex_.Lock();
+ }
+
+ // Can't use iterator to go over purge_files_ because inside the loop we're
+ // unlocking the mutex that protects purge_files_.
+ while (!purge_files_.empty()) {
+ auto it = purge_files_.begin();
+ // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+ PurgeFileInfo purge_file = it->second;
+
+ const std::string& fname = purge_file.fname;
+ const std::string& dir_to_sync = purge_file.dir_to_sync;
+ FileType type = purge_file.type;
+ uint64_t number = purge_file.number;
+ int job_id = purge_file.job_id;
+
+ purge_files_.erase(it);
+
+ mutex_.Unlock();
+ DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+ mutex_.Lock();
+ }
+
+ bg_purge_scheduled_--;
+
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ mutex_.Unlock();
+}
+
+namespace {
+struct IterState {
+ IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version,
+ bool _background_purge)
+ : db(_db),
+ mu(_mu),
+ super_version(_super_version),
+ background_purge(_background_purge) {}
+
+ DBImpl* db;
+ InstrumentedMutex* mu;
+ SuperVersion* super_version;
+ bool background_purge;
+};
+
+static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
+ IterState* state = reinterpret_cast<IterState*>(arg1);
+
+ if (state->super_version->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread
+ JobContext job_context(0);
+
+ state->mu->Lock();
+ state->super_version->Cleanup();
+ state->db->FindObsoleteFiles(&job_context, false, true);
+ if (state->background_purge) {
+ state->db->ScheduleBgLogWriterClose(&job_context);
+ state->db->AddSuperVersionsToFreeQueue(state->super_version);
+ state->db->SchedulePurge();
+ }
+ state->mu->Unlock();
+
+ if (!state->background_purge) {
+ delete state->super_version;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ if (state->background_purge) {
+ // PurgeObsoleteFiles here does not delete files. Instead, it adds the
+ // files to be deleted to a job queue, and deletes them in a separate
+ // background thread.
+ state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
+ state->mu->Lock();
+ state->db->SchedulePurge();
+ state->mu->Unlock();
+ } else {
+ state->db->PurgeObsoleteFiles(job_context);
+ }
+ }
+ job_context.Clean();
+ }
+
+ delete state;
+}
+} // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* super_version,
+ Arena* arena,
+ RangeDelAggregator* range_del_agg,
+ SequenceNumber sequence) {
+ InternalIterator* internal_iter;
+ assert(arena != nullptr);
+ assert(range_del_agg != nullptr);
+ // Need to create internal iterator from the arena.
+ MergeIteratorBuilder merge_iter_builder(
+ &cfd->internal_comparator(), arena,
+ !read_options.total_order_seek &&
+ super_version->mutable_cf_options.prefix_extractor != nullptr);
+ // Collect iterator for mutable mem
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_options, arena));
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+ Status s;
+ if (!read_options.ignore_range_deletions) {
+ range_del_iter.reset(
+ super_version->mem->NewRangeTombstoneIterator(read_options, sequence));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ // Collect all needed child iterators for immutable memtables
+ if (s.ok()) {
+ super_version->imm->AddIterators(read_options, &merge_iter_builder);
+ if (!read_options.ignore_range_deletions) {
+ s = super_version->imm->AddRangeTombstoneIterators(read_options, arena,
+ range_del_agg);
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+ if (s.ok()) {
+ // Collect iterators for files in L0 - Ln
+ if (read_options.read_tier != kMemtableTier) {
+ super_version->current->AddIterators(read_options, file_options_,
+ &merge_iter_builder, range_del_agg);
+ }
+ internal_iter = merge_iter_builder.Finish();
+ IterState* cleanup =
+ new IterState(this, &mutex_, super_version,
+ read_options.background_purge_on_iterator_cleanup ||
+ immutable_db_options_.avoid_unnecessary_blocking_io);
+ internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+ return internal_iter;
+ } else {
+ CleanupSuperVersion(super_version);
+ }
+ return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+ return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+ return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ return GetImpl(read_options, key, get_impl_options);
+}
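+
+// Usage sketch (illustrative only; `db` is a hypothetical open DB*): reading
+// through a PinnableSlice lets Get() avoid a copy when the value can be
+// pinned in the block cache or memtable.
+//
+//   PinnableSlice value;
+//   Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key", &value);
+//   if (s.ok()) {
+//     // use value.data() / value.size(); the pin is released when `value`
+//     // goes out of scope or value.Reset() is called
+//   }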
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+ GetImplOptions get_impl_options) {
+ assert(get_impl_options.value != nullptr ||
+ get_impl_options.merge_operands != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(get_impl_options.column_family);
+ auto cfd = cfh->cfd();
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(get_impl_options.column_family, key);
+ }
+ }
+
+ // Acquire SuperVersion
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ TEST_SYNC_POINT("DBImpl::GetImpl:1");
+ TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+ SequenceNumber snapshot;
+ if (read_options.snapshot != nullptr) {
+ if (get_impl_options.callback) {
+ // Already calculated based on read_options.snapshot
+ snapshot = get_impl_options.callback->max_visible_seq();
+ } else {
+ snapshot =
+ reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ }
+ } else {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+ // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ if (get_impl_options.callback) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ get_impl_options.callback->Refresh(snapshot);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ snapshot = get_impl_options.callback->max_visible_seq();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:3");
+ TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+ // Prepare to store a list of merge operations if merge occurs.
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ Status s;
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ // Get value associated with key
+ if (get_impl_options.get_value) {
+ if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+ get_impl_options.value->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ } else {
+ // Get the merge operands associated with the key. They should not be
+ // merged, and the raw values should be returned to the user.
+ if (sv->mem->Get(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options, nullptr,
+ nullptr, false)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return s;
+ }
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ sv->current->Get(
+ read_options, lkey, get_impl_options.value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+ nullptr, nullptr,
+ get_impl_options.get_value ? get_impl_options.callback : nullptr,
+ get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+ get_impl_options.get_value);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = 0;
+ if (s.ok()) {
+ if (get_impl_options.get_value) {
+ size = get_impl_options.value->size();
+ } else {
+ // Return all merge operands for get_impl_options.key
+ *get_impl_options.number_of_operands =
+ static_cast<int>(merge_context.GetNumOperands());
+ if (*get_impl_options.number_of_operands >
+ get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands) {
+ s = Status::Incomplete(
+ Status::SubCode::KMergeOperandsInsufficientCapacity);
+ } else {
+ for (const Slice& sl : merge_context.GetOperands()) {
+ size += sl.size();
+ get_impl_options.merge_operands->PinSelf(sl);
+ get_impl_options.merge_operands++;
+ }
+ }
+ }
+ RecordTick(stats_, BYTES_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ }
+ return s;
+}
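+
+// A minimal usage sketch of the read path implemented by GetImpl() above, as
+// reached through the public DB::Get() API (illustrative only; the db handle,
+// path and keys below are hypothetical and not part of this source file):
+//
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
+//   std::string value;
+//   if (s.ok()) {
+//     s = db->Put(rocksdb::WriteOptions(), "k1", "v1");
+//   }
+//   if (s.ok()) {
+//     // Looks in the memtable, then immutable memtables, then SST files.
+//     s = db->Get(rocksdb::ReadOptions(), "k1", &value);
+//   }
+//   delete db;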
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+  SequenceNumber consistent_seqnum;
+
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+ column_family.size());
+ for (auto cf : column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
+ auto cfd = cfh->cfd();
+ if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+ multiget_cf_data.emplace(cfd->GetID(),
+ MultiGetColumnFamilyData(cfh, nullptr));
+ }
+ }
+
+ std::function<MultiGetColumnFamilyData*(
+ std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+ iter_deref_lambda =
+ [](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
+ cf_iter) { return &cf_iter->second; };
+
+ bool unref_only =
+ MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ // Contain a list of merge operations if merge occurs.
+ MergeContext merge_context;
+
+ // Note: this always resizes the values array
+ size_t num_keys = keys.size();
+ std::vector<Status> stat_list(num_keys);
+ values->resize(num_keys);
+
+ // Keep track of bytes that we read for statistics-recording later
+ uint64_t bytes_read = 0;
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t num_found = 0;
+ for (size_t i = 0; i < num_keys; ++i) {
+ merge_context.Clear();
+ Status& s = stat_list[i];
+ std::string* value = &(*values)[i];
+
+ LookupKey lkey(keys[i], consistent_seqnum);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+ assert(mgd_iter != multiget_cf_data.end());
+ auto mgd = mgd_iter->second;
+ auto super_version = mgd.super_version;
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ if (super_version->mem->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if (super_version->imm->Get(lkey, value, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done) {
+ PinnableSlice pinnable_val;
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, &pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ if (s.ok()) {
+ bytes_read += value->size();
+ num_found++;
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ autovector<SuperVersion*> superversions_to_delete;
+
+ for (auto mgd_iter : multiget_cf_data) {
+ auto mgd = mgd_iter.second;
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+ } else {
+ mgd.cfd->GetSuperVersion()->Unref();
+ }
+ }
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return stat_list;
+}
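+
+// A usage sketch of the vector-based MultiGet() overload above (illustrative
+// only; "db" is a hypothetical, already-opened DB handle):
+//
+//   std::vector<rocksdb::Slice> keys = {"k1", "k2", "k3"};
+//   std::vector<rocksdb::ColumnFamilyHandle*> cfs(keys.size(),
+//                                                 db->DefaultColumnFamily());
+//   std::vector<std::string> values;
+//   std::vector<rocksdb::Status> statuses =
+//       db->MultiGet(rocksdb::ReadOptions(), cfs, keys, &values);
+//   for (size_t i = 0; i < keys.size(); ++i) {
+//     // values[i] is only meaningful when statuses[i].ok();
+//     // statuses[i].IsNotFound() indicates a missing key.
+//   }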
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot) {
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ bool last_try = false;
+ if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the
+    // thread-local super version.
+ auto cf_iter = cf_list->begin();
+ auto node = iter_deref_func(cf_iter);
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ if (read_options.snapshot != nullptr) {
+      // Note: In WritePrepared txns this is not necessary but not harmful
+      // either. Since prep_seq > snapshot implies commit_seq > snapshot, if a
+      // snapshot is specified we are fine with skipping seq numbers that are
+      // greater than that.
+ //
+ // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+ // may skip uncommitted data that should be visible to the transaction for
+ // reading own writes.
+ *snapshot =
+ static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ if (callback) {
+ *snapshot = std::max(*snapshot, callback->max_visible_seq());
+ }
+ } else {
+ // Since we get and reference the super version before getting
+ // the snapshot number, without a mutex protection, it is possible
+ // that a memtable switch happened in the middle and not all the
+ // data for this snapshot is available. But it will contain all
+ // the data available in the super version we have, which is also
+ // a valid snapshot to read from.
+      // We shouldn't get the snapshot before finding and referencing the
+      // super version, because a flush happening in between may compact away
+      // data that the snapshot needs (the snapshot is earlier than the data
+      // overwriting it), and users could then see wrong results.
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ }
+ } else {
+    // If we run into the same issue of the memtable getting sealed during two
+    // consecutive retries, the write rate is very high. In that case it is
+    // probably OK to take the mutex on the third try so that we are sure to
+    // succeed.
+ static const int num_retries = 3;
+ for (int i = 0; i < num_retries; ++i) {
+ last_try = (i == num_retries - 1);
+ bool retry = false;
+
+ if (i > 0) {
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ SuperVersion* super_version = node->super_version;
+ ColumnFamilyData* cfd = node->cfd;
+ if (super_version != nullptr) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ }
+ node->super_version = nullptr;
+ }
+ }
+ if (read_options.snapshot == nullptr) {
+ if (last_try) {
+ TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+ // We're close to max number of retries. For the last retry,
+ // acquire the lock so we're sure to succeed
+ mutex_.Lock();
+ }
+ *snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ *snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_;
+ }
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ if (!last_try) {
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ } else {
+ node->super_version = node->cfd->GetSuperVersion()->Ref();
+ }
+ TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+ if (read_options.snapshot != nullptr || last_try) {
+ // If user passed a snapshot, then we don't care if a memtable is
+ // sealed or compaction happens because the snapshot would ensure
+ // that older key versions are kept around. If this is the last
+ // retry, then we have the lock so nothing bad can happen
+ continue;
+ }
+ // We could get the earliest sequence number for the whole list of
+ // memtables, which will include immutable memtables as well, but that
+ // might be tricky to maintain in case we decide, in future, to do
+ // memtable compaction.
+ if (!last_try) {
+ SequenceNumber seq =
+ node->super_version->mem->GetEarliestSequenceNumber();
+ if (seq > *snapshot) {
+ retry = true;
+ break;
+ }
+ }
+ }
+ if (!retry) {
+ if (last_try) {
+ mutex_.Unlock();
+ }
+ break;
+ }
+ }
+ }
+
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ if (num_keys == 0) {
+ return;
+ }
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_families[i], keys[i], &values[i],
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+ multiget_cf_data;
+ size_t cf_start = 0;
+ ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+ for (size_t i = 0; i < num_keys; ++i) {
+ KeyContext* key_ctx = sorted_keys[i];
+ if (key_ctx->column_family != cf) {
+ multiget_cf_data.emplace_back(
+ MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
+ cf_start = i;
+ cf = key_ctx->column_family;
+ }
+ }
+  multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+ iter_deref_lambda =
+ [](autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ for (auto cf_iter = multiget_cf_data.begin();
+ cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
+ cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
+ } else {
+ cf_iter->cfd->GetSuperVersion()->Unref();
+ }
+ }
+}
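+
+// A usage sketch of the batched MultiGet() overload above, which takes raw
+// arrays and PinnableSlice outputs (illustrative only; "db" is a hypothetical
+// DB handle):
+//
+//   constexpr size_t kNum = 2;
+//   rocksdb::ColumnFamilyHandle* cfs[kNum] = {db->DefaultColumnFamily(),
+//                                             db->DefaultColumnFamily()};
+//   rocksdb::Slice keys[kNum] = {"k1", "k2"};
+//   rocksdb::PinnableSlice values[kNum];
+//   rocksdb::Status statuses[kNum];
+//   db->MultiGet(rocksdb::ReadOptions(), kNum, cfs, keys, values, statuses,
+//                /*sorted_input=*/false);
+//   // With sorted_input == false, PrepareMultiGetKeys() below sorts the keys
+//   // by column family ID and then by key before the batched lookup.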
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+ inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+ uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ if (cfd_id1 < cfd_id2) {
+ return true;
+ } else if (cfd_id1 > cfd_id2) {
+ return false;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ if (cmp < 0) {
+ return true;
+ }
+ return false;
+ }
+};
+
+} // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+ size_t num_keys, bool sorted_input,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+#ifndef NDEBUG
+ if (sorted_input) {
+ for (size_t index = 0; index < sorted_keys->size(); ++index) {
+ if (index > 0) {
+ KeyContext* lhs = (*sorted_keys)[index - 1];
+ KeyContext* rhs = (*sorted_keys)[index];
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+        cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+ uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ assert(cfd_id1 <= cfd_id2);
+ if (cfd_id1 < cfd_id2) {
+ continue;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
+ assert(cmp <= 0);
+ }
+ }
+ }
+#endif
+ if (!sorted_input) {
+ CompareKeyContext sort_comparator;
+ std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+ sort_comparator);
+ }
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input) {
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+ MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+ multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+ iter_deref_lambda =
+ [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ size_t num_keys = sorted_keys->size();
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+ read_options, callback, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+#ifndef NDEBUG
+ assert(!unref_only);
+#else
+ // Silence unused variable warning
+ (void)unref_only;
+#endif // NDEBUG
+
+ if (callback && read_options.snapshot == nullptr) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ callback->Refresh(consistent_seqnum);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ consistent_seqnum = callback->max_visible_seq();
+ }
+
+ MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+ multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
+ nullptr);
+ ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+ multiget_cf_data[0].super_version);
+}
+
+void DBImpl::MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* super_version, SequenceNumber snapshot,
+ ReadCallback* callback, bool* is_blob_index) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_MULTIGET);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t keys_left = num_keys;
+ while (keys_left) {
+ size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+ ? MultiGetContext::MAX_BATCH_SIZE
+ : keys_left;
+ MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+ batch_size, snapshot);
+ MultiGetRange range = ctx.GetMultiGetRange();
+ bool lookup_current = false;
+
+ keys_left -= batch_size;
+ for (auto mget_iter = range.begin(); mget_iter != range.end();
+ ++mget_iter) {
+ mget_iter->merge_context.Clear();
+ *mget_iter->s = Status::OK();
+ }
+
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ if (!skip_memtable) {
+ super_version->mem->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ if (!range.empty()) {
+ super_version->imm->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ if (!range.empty()) {
+ lookup_current = true;
+ uint64_t left = range.KeysLeft();
+ RecordTick(stats_, MEMTABLE_MISS, left);
+ }
+ }
+ if (lookup_current) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->MultiGet(read_options, &range, callback,
+ is_blob_index);
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ size_t num_found = 0;
+ uint64_t bytes_read = 0;
+ for (size_t i = start_key; i < start_key + num_keys; ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ if (key->s->ok()) {
+ bytes_read += key->value->size();
+ num_found++;
+ }
+ }
+
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
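+
+// A usage sketch for creating and dropping a column family through the APIs
+// implemented in this file (illustrative only; "db" is a hypothetical DB
+// handle):
+//
+//   rocksdb::ColumnFamilyHandle* cf = nullptr;
+//   rocksdb::Status s =
+//       db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), "new_cf", &cf);
+//   if (s.ok()) {
+//     s = db->Put(rocksdb::WriteOptions(), cf, "k1", "v1");
+//   }
+//   if (s.ok()) {
+//     s = db->DropColumnFamily(cf);           // marks the CF as dropped
+//   }
+//   s = db->DestroyColumnFamilyHandle(cf);    // releases the handle itself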
+
+Status DBImpl::CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_family_names.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_families.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(column_families[i].options,
+ column_families[i].name, &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ Status s;
+ Status persist_options_status;
+ *handle = nullptr;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+ if (s.ok()) {
+ for (auto& cf_path : cf_options.cf_paths) {
+ s = env_->CreateDirIfMissing(cf_path.path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+ nullptr) {
+ return Status::InvalidArgument("Column family already exists");
+ }
+ VersionEdit edit;
+ edit.AddColumnFamily(column_family_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ edit.SetColumnFamily(new_id);
+ edit.SetLogNumber(logfile_number_);
+ edit.SetComparatorName(cf_options.comparator->Name());
+
+ { // write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ // LogAndApply will both write the creation in MANIFEST and create
+ // ColumnFamilyData object
+ s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+ &mutex_, directories_.GetDbDir(), false,
+ &cf_options);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ std::map<std::string, std::shared_ptr<Directory>> dummy_created_dirs;
+ s = cfd->AddDirectories(&dummy_created_dirs);
+ }
+ if (s.ok()) {
+ single_column_family_mode_ = false;
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+ *cfd->GetLatestMutableCFOptions());
+
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ is_snapshot_supported_ = false;
+ }
+
+ cfd->set_initialized();
+
+ *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Created column family [%s] (ID %u)",
+ column_family_name.c_str(), (unsigned)cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Creating column family [%s] FAILED -- %s",
+ column_family_name.c_str(), s.ToString().c_str());
+ }
+ } // InstrumentedMutexLock l(&mutex_)
+
+ sv_context.Clean();
+ // this is outside the mutex
+ if (s.ok()) {
+ NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(*handle)->cfd());
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+ assert(column_family != nullptr);
+ Status s = DropColumnFamilyImpl(column_family);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ bool success_once = false;
+ for (auto* handle : column_families) {
+ s = DropColumnFamilyImpl(handle);
+ if (!s.ok()) {
+ break;
+ }
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (cfd->GetID() == 0) {
+ return Status::InvalidArgument("Can't drop default column family");
+ }
+
+ bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+ VersionEdit edit;
+ edit.DropColumnFamily();
+ edit.SetColumnFamily(cfd->GetID());
+
+ Status s;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd->IsDropped()) {
+ s = Status::InvalidArgument("Column family already dropped!\n");
+ }
+ if (s.ok()) {
+ // we drop column family from a single write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ if (!cf_support_snapshot) {
+ // Dropped Column Family doesn't support snapshot. Need to recalculate
+ // is_snapshot_supported_.
+ bool new_is_snapshot_supported = true;
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+ new_is_snapshot_supported = false;
+ break;
+ }
+ }
+ is_snapshot_supported_ = new_is_snapshot_supported;
+ }
+ bg_cv_.SignalAll();
+ }
+
+ if (s.ok()) {
+ // Note that here we erase the associated cf_info of the to-be-dropped
+ // cfd before its ref-count goes to zero to avoid having to erase cf_info
+ // later inside db_mutex.
+ EraseThreadStatusCfInfo(cfd);
+ assert(cfd->IsDropped());
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Dropped column family with id %u\n", cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Dropping column family with id %u FAILED -- %s\n",
+ cfd->GetID(), s.ToString().c_str());
+ }
+
+ return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, bool* value_found) {
+ assert(value != nullptr);
+ if (value_found != nullptr) {
+ // falsify later if key-may-exist but can't fetch value
+ *value_found = true;
+ }
+ ReadOptions roptions = read_options;
+ roptions.read_tier = kBlockCacheTier; // read from block cache only
+ PinnableSlice pinnable_val;
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = value_found;
+ auto s = GetImpl(roptions, key, get_impl_options);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+ return s.ok() || s.IsIncomplete();
+}
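+
+// A usage sketch of KeyMayExist() above (illustrative only; "db" is a
+// hypothetical DB handle). A return value of true can be a false positive,
+// since only the memtables and the block cache are consulted:
+//
+//   std::string value;
+//   bool value_found = false;
+//   bool may_exist =
+//       db->KeyMayExist(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+//                       "k1", &value, &value_found);
+//   if (may_exist && value_found) {
+//     // "value" holds the data fetched from the memtable or block cache.
+//   }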
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ Iterator* result = nullptr;
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ // if iterator wants internal keys, we can only proceed if
+ // we can guarantee the deletes haven't been processed yet
+ if (immutable_db_options_.preserve_deletes &&
+ read_options.iter_start_seqnum > 0 &&
+ read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) {
+ return NewErrorIterator(Status::InvalidArgument(
+ "Iterator requested internal keys which are too old and are not"
+ " guaranteed to be preserved, try larger iter_start_seqnum opt."));
+ }
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ // not supported in lite version
+ result = nullptr;
+
+#else
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ result = NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+ this, cfd);
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
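+
+// A usage sketch of the iterator returned by NewIterator() above
+// (illustrative only; "db" is a hypothetical DB handle):
+//
+//   rocksdb::ReadOptions ro;
+//   std::unique_ptr<rocksdb::Iterator> it(
+//       db->NewIterator(ro, db->DefaultColumnFamily()));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // it->key() and it->value() stay valid until the iterator is moved.
+//   }
+//   assert(it->status().ok());  // check for errors after the scan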
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob,
+ bool allow_refresh) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+ // Try to generate a DB iterator tree in continuous memory area to be
+ // cache friendly. Here is an example of result:
+ // +-------------------------------+
+ // | |
+ // | ArenaWrappedDBIter |
+ // | + |
+ // | +---> Inner Iterator ------------+
+ // | | | |
+ // | | +-- -- -- -- -- -- -- --+ |
+ // | +--- | Arena | |
+ // | | | |
+ // | Allocated Memory: | |
+ // | | +-------------------+ |
+ // | | | DBIter | <---+
+ // | | + |
+ // | | | +-> iter_ ------------+
+ // | | | | |
+ // | | +-------------------+ |
+ // | | | MergingIterator | <---+
+ // | | + |
+ // | | | +->child iter1 ------------+
+ // | | | | | |
+ // | | +->child iter2 ----------+ |
+ // | | | | | | |
+ // | | | +->child iter3 --------+ | |
+ // | | | | | |
+ // | | +-------------------+ | | |
+ // | | | Iterator1 | <--------+
+ // | | +-------------------+ | |
+ // | | | Iterator2 | <------+
+ // | | +-------------------+ |
+ // | | | Iterator3 | <----+
+ // | | +-------------------+
+ // | | |
+ // +-------+-----------------------+
+ //
+ // ArenaWrappedDBIter inlines an arena area where all the iterators in
+ // the iterator tree are allocated in the order of being accessed when
+ // querying.
+ // Laying out the iterators in the order of being accessed makes it more
+ // likely that any iterator pointer is close to the iterator it points to so
+ // that they are likely to be in the same cache line and/or page.
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback, this, cfd, allow_blob,
+ read_options.snapshot != nullptr ? false : allow_refresh);
+
+ InternalIterator* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+
+ return db_iter;
+}
+
+Status DBImpl::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ return Status::InvalidArgument(
+ "Tailing iterator not supported in RocksDB lite");
+#else
+ for (auto cfh : column_families) {
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv);
+ iterators->push_back(NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ read_callback, this, cfd));
+ }
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ for (size_t i = 0; i < column_families.size(); ++i) {
+ auto* cfd =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+ }
+ }
+
+ return Status::OK();
+}
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+ return GetSnapshotImpl(true);
+}
+#endif // ROCKSDB_LITE
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock) {
+ int64_t unix_time = 0;
+ env_->GetCurrentTime(&unix_time); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ if (lock) {
+ mutex_.Lock();
+ }
+ // returns null if the underlying memtable does not support snapshot.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return nullptr;
+ }
+ auto snapshot_seq = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return snapshot;
+}
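+
+// A usage sketch for reading at a fixed snapshot via the functions above
+// (illustrative only; "db" is a hypothetical DB handle):
+//
+//   const rocksdb::Snapshot* snap = db->GetSnapshot();
+//   rocksdb::ReadOptions ro;
+//   ro.snapshot = snap;            // pin reads to this sequence number
+//   std::string value;
+//   rocksdb::Status s = db->Get(ro, "k1", &value);
+//   db->ReleaseSnapshot(snap);     // required before Close() can succeed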
+
+namespace {
+typedef autovector<ColumnFamilyData*, 2> CfdList;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+ for (const ColumnFamilyData* t : list) {
+ if (t == cfd) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+ const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ snapshots_.Delete(casted_s);
+ uint64_t oldest_snapshot;
+ if (snapshots_.empty()) {
+ oldest_snapshot = last_seq_same_as_publish_seq_
+ ? versions_->LastSequence()
+ : versions_->LastPublishedSequence();
+ } else {
+ oldest_snapshot = snapshots_.oldest()->number_;
+ }
+    // Avoid going through every column family by checking a global threshold
+    // first.
+ if (oldest_snapshot > bottommost_files_mark_threshold_) {
+ CfdList cf_scheduled;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+ if (!cfd->current()
+ ->storage_info()
+ ->BottommostFilesMarkedForCompaction()
+ .empty()) {
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ cf_scheduled.push_back(cfd);
+ }
+ }
+
+ // Calculate a new threshold, skipping those CFs where compactions are
+ // scheduled. We do not do the same pass as the previous loop because
+ // mutex might be unlocked during the loop, making the result inaccurate.
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (CfdListContains(cf_scheduled, cfd)) {
+ continue;
+ }
+ new_bottommost_files_mark_threshold = std::min(
+ new_bottommost_files_mark_threshold,
+ cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+ }
+ }
+ delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfAllTables(props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+ const Range* range, std::size_t n,
+ TablePropertiesCollection* props) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+#endif // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+ static LegacyFileSystemWrapper fs_wrap(GetEnv());
+ return &fs_wrap;
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+ return immutable_db_options_.fs.get();
+}
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+ InstrumentedMutexLock l(&mutex_);
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+ InstrumentedMutexLock l(&mutex_);
+ return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_int) {
+ uint64_t int_value;
+ bool ret_value =
+ GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+ if (ret_value) {
+ *value = ToString(int_value);
+ }
+ return ret_value;
+ } else if (property_info->handle_string) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ } else if (property_info->handle_string_dbimpl) {
+ std::string tmp_value;
+ bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value);
+ if (ret_value) {
+ *value = tmp_value;
+ }
+ return ret_value;
+ }
+  // Shouldn't reach here, since exactly one of the property handlers
+  // (handle_int, handle_string, or handle_string_dbimpl) should be non-null.
+ assert(false);
+ return false;
+}
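+
+// A usage sketch of GetProperty()/GetIntProperty() implemented here
+// (illustrative only; "db" is a hypothetical DB handle; the property names
+// are standard RocksDB properties):
+//
+//   std::string num_keys;
+//   if (db->GetProperty(db->DefaultColumnFamily(),
+//                       "rocksdb.estimate-num-keys", &num_keys)) {
+//     // num_keys holds the estimate as a decimal string.
+//   }
+//   uint64_t mem_usage = 0;
+//   db->GetIntProperty(db->DefaultColumnFamily(),
+//                      "rocksdb.cur-size-all-mem-tables", &mem_usage);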
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_map) {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ }
+ // If we reach this point it means that handle_map is not provided for the
+ // requested property
+ return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value) {
+ assert(property_info.handle_int != nullptr);
+ if (!property_info.need_out_of_mutex) {
+ if (is_locked) {
+ mutex_.AssertHeld();
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ }
+ } else {
+ SuperVersion* sv = nullptr;
+ if (!is_locked) {
+ sv = GetAndRefSuperVersion(cfd);
+ } else {
+ sv = cfd->GetSuperVersion();
+ }
+
+ bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+ property_info, sv->current, value);
+
+ if (!is_locked) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ return ret;
+ }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+ assert(value != nullptr);
+ Statistics* statistics = immutable_db_options_.statistics.get();
+ if (!statistics) {
+ return false;
+ }
+ *value = statistics->ToString();
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->Clear();
+ }
+ }
+ return Status::OK();
+}
+#endif // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+
+ uint64_t sum = 0;
+ {
+ // Needs mutex to protect the list of column families.
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t value;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ if (GetIntPropertyInternal(cfd, *property_info, true, &value)) {
+ sum += value;
+ } else {
+ return false;
+ }
+ }
+ }
+ *aggregated_value = sum;
+ return true;
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+ // TODO(ljin): consider using GetReferencedSuperVersion() directly
+ return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+ if (!cfd) {
+ return nullptr;
+ }
+
+ return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+ // Release SuperVersion
+ if (sv->Unref()) {
+    bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ SchedulePurge();
+ }
+ }
+ if (!defer_purge) {
+ delete sv;
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+ SuperVersion* sv) {
+ if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+ CleanupSuperVersion(sv);
+ }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+ SuperVersion* sv) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+ // If SuperVersion is held, and we successfully fetched a cfd using
+ // GetAndRefSuperVersion(), it must still exist.
+ assert(cfd != nullptr);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+ ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+ if (!cf_memtables->Seek(column_family_id)) {
+ return nullptr;
+ }
+
+ return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+ if (cfd == nullptr) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<ColumnFamilyHandleImpl>(
+ new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandleImpl* cfh =
+ reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+ MemTable::MemTableStats memStats =
+ sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+ MemTable::MemTableStats immStats =
+ sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+ *count = memStats.count + immStats.count;
+ *size = memStats.size + immStats.size;
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes) {
+ if (!options.include_memtabtles && !options.include_files) {
+ return Status::InvalidArgument("Invalid options");
+ }
+
+ Version* v;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ v = sv->current;
+
+ for (int i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ sizes[i] = 0;
+ if (options.include_files) {
+ sizes[i] += versions_->ApproximateSize(
+ options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+ /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+ }
+ if (options.include_memtabtles) {
+ sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ }
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return Status::OK();
+}
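+
+// A usage sketch of GetApproximateSizes() above (illustrative only; "db" is a
+// hypothetical DB handle). Note that the option fields are used exactly as
+// declared, including the "include_memtabtles" spelling:
+//
+//   rocksdb::SizeApproximationOptions size_opts;
+//   size_opts.include_files = true;
+//   size_opts.include_memtabtles = true;
+//   rocksdb::Range r("a", "z");
+//   uint64_t size = 0;
+//   rocksdb::Status s = db->GetApproximateSizes(
+//       size_opts, db->DefaultColumnFamily(), &r, 1, &size);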
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+ // We need to remember the iterator of our insert, because after the
+ // background job is done, we need to remove that element from
+ // pending_outputs_.
+ pending_outputs_.push_back(versions_->current_next_file_number());
+ auto pending_outputs_inserted_elem = pending_outputs_.end();
+ --pending_outputs_inserted_elem;
+ return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+ if (v.get() != nullptr) {
+ pending_outputs_.erase(*v.get());
+ v.reset();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) {
+ RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+ if (seq > versions_->LastSequence()) {
+ return Status::NotFound("Requested sequence not yet written in the db");
+ }
+ return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+ uint64_t number;
+ FileType type;
+ WalFileType log_type;
+ if (!ParseFileName(name, &number, &type, &log_type) ||
+ (type != kTableFile && type != kLogFile)) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+ name.c_str());
+ return Status::InvalidArgument("Invalid file name");
+ }
+
+ Status status;
+ if (type == kLogFile) {
+ // Only allow deleting archived log files
+ if (log_type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed - not archived log.\n",
+ name.c_str());
+ return Status::NotSupported("Delete only supported for archived logs");
+ }
+ status = wal_manager_.DeleteFile(name, number);
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed -- %s.\n", name.c_str(),
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ int level;
+ FileMetaData* metadata;
+ ColumnFamilyData* cfd;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed. File not found\n", name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not found");
+ }
+ assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted, there is no need to delete it.
+ if (metadata->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DeleteFile %s Skipped. File about to be compacted\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::OK();
+ }
+
+ // Only the files in the last level can be deleted externally.
+ // This is to make sure that any deletion tombstones are not
+ // lost. Check that the level passed is the last level.
+ auto* vstoreage = cfd->current()->storage_info();
+ for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+ if (vstoreage->NumLevelFiles(i) != 0) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s FAILED. File not in last level\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not in last level");
+ }
+ }
+ // if level == 0, it has to be the oldest file
+ if (level == 0 &&
+ vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed ---"
+ " target file in level 0 must be the oldest.",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File in level 0, but not oldest");
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(level, number);
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ Status status;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ VersionEdit edit;
+ std::set<FileMetaData*> deleted_files;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* input_version = cfd->current();
+
+ auto* vstorage = input_version->storage_info();
+ for (size_t r = 0; r < n; r++) {
+ auto begin = ranges[r].start, end = ranges[r].limit;
+ for (int i = 1; i < cfd->NumberLevels(); i++) {
+ if (vstorage->LevelFiles(i).empty() ||
+ !vstorage->OverlapInLevel(i, begin, end)) {
+ continue;
+ }
+ std::vector<FileMetaData*> level_files;
+ InternalKey begin_storage, end_storage, *begin_key, *end_key;
+ if (begin == nullptr) {
+ begin_key = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ begin_key = &begin_storage;
+ }
+ if (end == nullptr) {
+ end_key = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ end_key = &end_storage;
+ }
+
+ vstorage->GetCleanInputsWithinInterval(
+ i, begin_key, end_key, &level_files, -1 /* hint_index */,
+ nullptr /* file_index */);
+ FileMetaData* level_file;
+ for (uint32_t j = 0; j < level_files.size(); j++) {
+ level_file = level_files[j];
+ if (level_file->being_compacted) {
+ continue;
+ }
+ if (deleted_files.find(level_file) != deleted_files.end()) {
+ continue;
+ }
+ if (!include_end && end != nullptr &&
+ cfd->user_comparator()->Compare(level_file->largest.user_key(),
+ *end) == 0) {
+ continue;
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(i, level_file->fd.GetNumber());
+ deleted_files.insert(level_file);
+ level_file->being_compacted = true;
+ }
+ }
+ }
+ if (edit.GetDeletedFiles().empty()) {
+ job_context.Clean();
+ return Status::OK();
+ }
+ input_version->Ref();
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ for (auto* deleted_file : deleted_files) {
+ deleted_file->being_compacted = false;
+ }
+ input_version->Unref();
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
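+
+// DeleteFilesInRanges() above is typically reached through the convenience
+// wrappers declared in include/rocksdb/convenience.h (an assumption about the
+// usual call path; "db" below is a hypothetical DB handle). Only SST files
+// fully contained in the range are dropped; overlapping files are kept:
+//
+//   rocksdb::Slice begin("a");
+//   rocksdb::Slice end("m");
+//   rocksdb::Status s = rocksdb::DeleteFilesInRange(
+//       db, db->DefaultColumnFamily(), &begin, &end, /*include_end=*/true);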
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->GetLiveFilesMetaData(metadata);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) {
+ assert(column_family);
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ auto* sv = GetAndRefSuperVersion(cfd);
+ {
+ // Without mutex, Version::GetColumnFamilyMetaData will have data race with
+ // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
+ // this may cause regression. An alternative is to make
+ // FileMetaData::being_compacted atomic, but it will make FileMetaData
+ // non-copy-able. Another option is to separate these variables from
+ // original FileMetaData struct, and this requires re-organization of data
+ // structures. For now, we take the easy approach. If
+ // DB::GetColumnFamilyMetaData is not called frequently, the regression
+ // should not be big. We still need to keep an eye on it.
+ InstrumentedMutexLock l(&mutex_);
+ sv->current->GetColumnFamilyMetaData(cf_meta);
+ }
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+#endif // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+ mutex_.AssertHeld();
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+ std::string corruption_messages;
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ // Instead of calling GetFileSize() for each expected file, call
+ // GetChildren() for the DB directory and check that all expected files
+ // are listed, without checking their sizes.
+ // Since sst files might be in different directories, do it for each
+ // directory separately.
+ std::map<std::string, std::vector<std::string>> files_by_directory;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/". Remove it.
+ std::string fname = md.name;
+ if (!fname.empty() && fname[0] == '/') {
+ fname = fname.substr(1);
+ }
+ files_by_directory[md.db_path].push_back(fname);
+ }
+ for (const auto& dir_files : files_by_directory) {
+ std::string directory = dir_files.first;
+ std::vector<std::string> existing_files;
+ Status s = env_->GetChildren(directory, &existing_files);
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't list files in " + directory + ": " + s.ToString() + "\n";
+ continue;
+ }
+ std::sort(existing_files.begin(), existing_files.end());
+
+ for (const std::string& fname : dir_files.second) {
+ if (!std::binary_search(existing_files.begin(), existing_files.end(),
+ fname) &&
+ !std::binary_search(existing_files.begin(), existing_files.end(),
+ Rocks2LevelTableFileName(fname))) {
+ corruption_messages +=
+ "Missing sst file " + fname + " in " + directory + "\n";
+ }
+ }
+ }
+ } else {
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+ Status s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ } else if (fsize != md.size) {
+ corruption_messages += "Sst file size mismatch: " + file_path +
+ ". Size recorded in manifest " +
+ ToString(md.size) + ", actual size " +
+ ToString(fsize) + "\n";
+ }
+ }
+ }
+
+ if (corruption_messages.size() == 0) {
+ return Status::OK();
+ } else {
+ return Status::Corruption(corruption_messages);
+ }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+ identity.assign(db_id_);
+ return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+ std::string idfilename = IdentityFileName(dbname_);
+ const FileOptions soptions;
+
+ Status s = ReadFileToString(fs_.get(), idfilename, identity);
+ if (!s.ok()) {
+ return s;
+ }
+
+  // If the last character is '\n', remove it from identity
+ if (identity->size() > 0 && identity->back() == '\n') {
+ identity->pop_back();
+ }
+ return s;
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+ const std::string& /*column_family_name*/,
+ ColumnFamilyHandle** /*handle*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const ColumnFamilyOptions& /*cf_options*/,
+ const std::vector<std::string>& /*column_family_names*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+ delete column_family;
+ return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+ if (!closed_) {
+ {
+ InstrumentedMutexLock l(&mutex_);
+      // If there are unreleased snapshots, fail the close call
+ if (!snapshots_.empty()) {
+ return Status::Aborted("Cannot close DB with unreleased snapshot.");
+ }
+ }
+
+ closed_ = true;
+ return CloseImpl();
+ }
+ return Status::OK();
+}
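+// Usage sketch (illustrative only, not part of the original source): Close()
+// refuses to shut down while a snapshot is still outstanding, so callers are
+// expected to release snapshots first. Assuming `db` is an open rocksdb::DB*:
+//
+//   const rocksdb::Snapshot* snap = db->GetSnapshot();
+//   rocksdb::Status s = db->Close();  // returns Status::Aborted()
+//   db->ReleaseSnapshot(snap);
+//   s = db->Close();                  // now expected to return Status::OK()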
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families) {
+ FileSystem* fs = db_options.file_system.get();
+ LegacyFileSystemWrapper legacy_fs(db_options.env);
+ if (!fs) {
+ fs = &legacy_fs;
+ }
+ return VersionSet::ListColumnFamilies(column_families, name, fs);
+}
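+// Usage sketch (illustrative only, not part of the original source): listing
+// the column families of a database that is not currently open; the path
+// "/tmp/testdb" below is hypothetical.
+//
+//   std::vector<std::string> cf_names;
+//   rocksdb::Status s = rocksdb::DB::ListColumnFamilies(
+//       rocksdb::DBOptions(), "/tmp/testdb", &cf_names);
+//   // On success cf_names contains at least "default".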
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+ std::vector<std::string> filenames;
+ bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions);
+
+ // Reset the logger because it holds a handle to the
+ // log file and prevents cleanup and directory removal
+ soptions.info_log.reset();
+ // Ignore error in case directory does not exist
+ env->GetChildren(dbname, &filenames);
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ Status result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del;
+ std::string path_to_delete = dbname + "/" + fname;
+ if (type == kMetaDatabase) {
+ del = DestroyDB(path_to_delete, options);
+ } else if (type == kTableFile || type == kLogFile) {
+ del = DeleteDBFile(&soptions, path_to_delete, dbname,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ } else {
+ del = env->DeleteFile(path_to_delete);
+ }
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+
+ std::vector<std::string> paths;
+
+ for (const auto& path : options.db_paths) {
+ paths.emplace_back(path.path);
+ }
+ for (const auto& cf : column_families) {
+ for (const auto& path : cf.options.cf_paths) {
+ paths.emplace_back(path.path);
+ }
+ }
+
+ // Remove duplicate paths.
+ // Note that we compare only the actual paths but not path ids.
+  // The reason is that the same path can appear under different path_ids
+  // for different column families.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+ for (const auto& path : paths) {
+ if (env->GetChildren(path, &filenames).ok()) {
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) &&
+ type == kTableFile) { // Lock file will be deleted at end
+ std::string table_path = path + "/" + fname;
+ Status del = DeleteDBFile(&soptions, table_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(path);
+ }
+ }
+
+ std::vector<std::string> walDirFiles;
+ std::string archivedir = ArchivalDirectory(dbname);
+ bool wal_dir_exists = false;
+ if (dbname != soptions.wal_dir) {
+ wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok();
+ archivedir = ArchivalDirectory(soptions.wal_dir);
+ }
+
+  // The archive dir may be inside the wal dir or dbname and should be
+  // processed and removed before those, otherwise we will have issues
+  // removing them.
+ std::vector<std::string> archiveFiles;
+ if (env->GetChildren(archivedir, &archiveFiles).ok()) {
+ // Delete archival files.
+ for (const auto& file : archiveFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(archivedir);
+ }
+
+ // Delete log files in the WAL dir
+ if (wal_dir_exists) {
+ for (const auto& file : walDirFiles) {
+ if (ParseFileName(file, &number, &type) && type == kLogFile) {
+ Status del =
+ DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+ soptions.wal_dir, /*force_bg=*/false,
+ /*force_fg=*/!wal_in_db_path);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->DeleteDir(soptions.wal_dir);
+ }
+
+ env->UnlockFile(lock); // Ignore error since state is already gone
+ env->DeleteFile(lockname);
+
+ // sst_file_manager holds a ref to the logger. Make sure the logger is
+ // gone before trying to remove the directory.
+ soptions.sst_file_manager.reset();
+
+ env->DeleteDir(dbname); // Ignore error in case dir contains other files
+ }
+ return result;
+}
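+// Usage sketch (illustrative only, not part of the original source): wiping a
+// database, including SST files placed in an extra db_path. Both paths below
+// are hypothetical.
+//
+//   rocksdb::Options options;
+//   options.db_paths.emplace_back("/tmp/testdb_fast", 0 /* target_size */);
+//   rocksdb::Status s = rocksdb::DestroyDB("/tmp/testdb", options);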
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+ bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+ WriteThread::Writer w;
+ if (need_mutex_lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+
+ // This part requires mutex to protect the column family options
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cf_names.push_back(cfd->GetName());
+ cf_opts.push_back(cfd->GetLatestCFOptions());
+ }
+
+ // Unlock during expensive operations. New writes cannot get here
+ // because the single write thread ensures all new writes get queued.
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ mutex_.Unlock();
+
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+
+ std::string file_name =
+ TempOptionsFileName(GetName(), versions_->NewFileNumber());
+ Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+ GetFileSystem());
+
+ if (s.ok()) {
+ s = RenameTempFileToOptionsFile(file_name);
+ }
+ // restore lock
+ if (!need_mutex_lock) {
+ mutex_.Lock();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unable to persist options -- %s", s.ToString().c_str());
+ if (immutable_db_options_.fail_if_options_file_error) {
+ return Status::IOError("Unable to persist options.",
+ s.ToString().c_str());
+ }
+ }
+#else
+ (void)need_mutex_lock;
+ (void)need_enter_write_thread;
+#endif // !ROCKSDB_LITE
+ return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+ const size_t num_files_to_keep,
+ const std::shared_ptr<Logger>& info_log,
+ Env* env) {
+ if (filenames.size() <= num_files_to_keep) {
+ return;
+ }
+ for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+ iter != filenames.end(); ++iter) {
+ if (!env->DeleteFile(iter->second).ok()) {
+ ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+ iter->second.c_str());
+ }
+ }
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+ std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
+ std::map<uint64_t, std::string> options_filenames;
+ Status s;
+ s = GetEnv()->GetChildren(GetName(), &filenames);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& filename : filenames) {
+ uint64_t file_number;
+ FileType type;
+ if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+ options_filenames.insert(
+ {std::numeric_limits<uint64_t>::max() - file_number,
+ GetName() + "/" + filename});
+ }
+ }
+
+  // Keep the latest 2 options files
+ const size_t kNumOptionsFilesKept = 2;
+ DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+ immutable_db_options_.info_log, GetEnv());
+ return Status::OK();
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
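+// Ordering note (illustrative, not part of the original source): keying the
+// map by (max_uint64 - file_number) makes std::map's ascending iteration visit
+// the newest OPTIONS files first, so DeleteOptionsFilesHelper() keeps the
+// first kNumOptionsFilesKept entries and deletes the rest. For example, with
+// OPTIONS-000009, OPTIONS-000012 and OPTIONS-000015 on disk, the iteration
+// order is 000015, 000012, 000009, and only OPTIONS-000009 is deleted.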
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+ Status s;
+
+ uint64_t options_file_number = versions_->NewFileNumber();
+ std::string options_file_name =
+ OptionsFileName(GetName(), options_file_number);
+  // Retry if the file name happens to conflict with an existing one.
+ s = GetEnv()->RenameFile(file_name, options_file_name);
+ if (s.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->options_file_number_ = options_file_number;
+ }
+
+ if (0 == disable_delete_obsolete_files_) {
+ DeleteObsoleteOptionsFiles();
+ }
+ return s;
+#else
+ (void)file_name;
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+ cfd->ioptions()->env);
+ }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+ }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseDatabaseInfo(this);
+ }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+//
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+#if !defined(IOS_CROSS_COMPILE)
+ // if we compile with Xcode, we don't run build_detect_version, so we don't
+ // generate util/build_version.cc
+ ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR,
+ ROCKSDB_MINOR, ROCKSDB_PATCH);
+ ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha);
+ ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date);
+#else
+ (void)log; // ignore "-Wunused-parameter"
+#endif
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history) {
+ // Find the earliest sequence number that we know we can rely on reading
+ // from the memtable without needing to check sst files.
+ SequenceNumber earliest_seq =
+ sv->imm->GetEarliestSequenceNumber(include_history);
+ if (earliest_seq == kMaxSequenceNumber) {
+ earliest_seq = sv->mem->GetEarliestSequenceNumber();
+ }
+ assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+ return earliest_seq;
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index) {
+ Status s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ ReadOptions read_options;
+ SequenceNumber current_seq = versions_->LastSequence();
+ LookupKey lkey(key, current_seq);
+
+ *seq = kMaxSequenceNumber;
+ *found_record_for_key = false;
+
+ // Check if there is a record for this key in the latest memtable
+ sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTable::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check immutable memtables
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+ if (lower_bound_in_mem != kMaxSequenceNumber &&
+ lower_bound_in_mem < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ seq, read_options, nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check memtable history
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+ if (lower_bound_in_imm != kMaxSequenceNumber &&
+ lower_bound_in_imm < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+  // Check if there is a record for this key in the memtable history
+ sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, seq, read_options,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(
+ immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check SST files
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+ // check here to skip the history if possible. But currently the caller
+ // already does that. Maybe we should move the logic here later.
+
+ // TODO(agiardullo): possible optimization: consider checking cached
+ // SST files if cache_only=true?
+ if (!cache_only) {
+ // Check tables
+ sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, nullptr /* value_found */,
+ found_record_for_key, seq, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading SST files
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from Version::Get: %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) {
+ IngestExternalFileArg arg;
+ arg.column_family = column_family;
+ arg.external_files = external_files;
+ arg.options = ingestion_options;
+ return IngestExternalFiles({arg});
+}
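+// Usage sketch (illustrative only, not part of the original source): building
+// an external SST file with rocksdb::SstFileWriter and ingesting it into the
+// default column family; the file path is hypothetical.
+//
+//   rocksdb::Options options;
+//   rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
+//   rocksdb::Status s = writer.Open("/tmp/batch1.sst");
+//   if (s.ok()) s = writer.Put("key1", "value1");  // keys must be added in
+//   if (s.ok()) s = writer.Put("key2", "value2");  // comparator order
+//   if (s.ok()) s = writer.Finish();
+//   if (s.ok()) {
+//     s = db->IngestExternalFile({"/tmp/batch1.sst"},
+//                                rocksdb::IngestExternalFileOptions());
+//   }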
+
+Status DBImpl::IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) {
+ if (args.empty()) {
+ return Status::InvalidArgument("ingestion arg list is empty");
+ }
+ {
+ std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+ for (const auto& arg : args) {
+ if (arg.column_family == nullptr) {
+ return Status::InvalidArgument("column family handle is null");
+ } else if (unique_cfhs.count(arg.column_family) > 0) {
+ return Status::InvalidArgument(
+ "ingestion args have duplicate column families");
+ }
+ unique_cfhs.insert(arg.column_family);
+ }
+ }
+ // Ingest multiple external SST files atomically.
+ size_t num_cfs = args.size();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (args[i].external_files.empty()) {
+ char err_msg[128] = {0};
+ snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+ return Status::InvalidArgument(err_msg);
+ }
+ }
+ for (const auto& arg : args) {
+ const IngestExternalFileOptions& ingest_opts = arg.options;
+ if (ingest_opts.ingest_behind &&
+ !immutable_db_options_.allow_ingest_behind) {
+ return Status::InvalidArgument(
+ "can't ingest_behind file in DB with allow_ingest_behind=false");
+ }
+ }
+
+ // TODO (yanqin) maybe handle the case in which column_families have
+ // duplicates
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ size_t total = 0;
+ for (const auto& arg : args) {
+ total += arg.external_files.size();
+ }
+ uint64_t next_file_number = 0;
+ Status status = ReserveFileNumbersBeforeIngestion(
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+ pending_output_elem, &next_file_number);
+ if (!status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+ for (const auto& arg : args) {
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+ ingestion_jobs.emplace_back(
+ env_, versions_.get(), cfd, immutable_db_options_, file_options_,
+ &snapshots_, arg.options, &directories_, &event_logger_);
+ }
+ std::vector<std::pair<bool, Status>> exec_results;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ exec_results.emplace_back(false, Status::OK());
+ }
+ // TODO(yanqin) maybe make jobs run in parallel
+ uint64_t start_file_number = next_file_number;
+ for (size_t i = 1; i != num_cfs; ++i) {
+ start_file_number += args[i - 1].external_files.size();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[i].second = ingestion_jobs[i].Prepare(
+ args[i].external_files, start_file_number, super_version);
+ exec_results[i].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+ {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ exec_results[0].second = ingestion_jobs[0].Prepare(
+ args[0].external_files, next_file_number, super_version);
+ exec_results[0].first = true;
+ CleanupSuperVersion(super_version);
+ }
+ for (const auto& exec_result : exec_results) {
+ if (!exec_result.second.ok()) {
+ status = exec_result.second;
+ break;
+ }
+ }
+ if (!status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (exec_results[i].first) {
+ ingestion_jobs[i].Cleanup(status);
+ }
+ }
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<SuperVersionContext> sv_ctxs;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs.emplace_back(true /* create_superversion */);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+ TEST_SYNC_POINT("DBImpl::AddFile:Start");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+    // When unordered_write is enabled, keys are written to the memtable in an
+    // unordered way. If the ingestion job checks the memtable key range before
+    // the keys land in the memtable, it may skip a necessary memtable flush.
+    // So wait here to ensure there are no pending writes to the memtable.
+ WaitForPendingWrites();
+
+ num_running_ingest_file_ += static_cast<int>(num_cfs);
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+ bool at_least_one_cf_need_flush = false;
+ std::vector<bool> need_flush(num_cfs, false);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ // TODO (yanqin) investigate whether we should abort ingestion or
+ // proceed with other non-dropped column families.
+ status = Status::InvalidArgument(
+ "cannot ingest an external file into a dropped CF");
+ break;
+ }
+ bool tmp = false;
+ status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+ need_flush[i] = tmp;
+ at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+ &at_least_one_cf_need_flush);
+
+ if (status.ok() && at_least_one_cf_need_flush) {
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds_to_flush;
+ SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ } else {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (need_flush[i]) {
+ mutex_.Unlock();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+ ->cfd();
+ status = FlushMemTable(cfd, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* writes_stopped */);
+ mutex_.Lock();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Run ingestion jobs.
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ status = ingestion_jobs[i].Run();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ if (status.ok()) {
+ int consumed_seqno_count =
+ ingestion_jobs[0].ConsumedSequenceNumbersCount();
+#ifndef NDEBUG
+ for (size_t i = 1; i != num_cfs; ++i) {
+ assert(!!consumed_seqno_count ==
+ !!ingestion_jobs[i].ConsumedSequenceNumbersCount());
+ consumed_seqno_count +=
+ ingestion_jobs[i].ConsumedSequenceNumbersCount();
+ }
+#endif
+ if (consumed_seqno_count > 0) {
+ const SequenceNumber last_seqno = versions_->LastSequence();
+ versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+ }
+ autovector<ColumnFamilyData*> cfds_to_commit;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfds_to_commit.push_back(cfd);
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ autovector<VersionEdit*> edit_list;
+ edit_list.push_back(ingestion_jobs[i].edit());
+ edit_lists.push_back(edit_list);
+ ++num_entries;
+ }
+ // Mark the version edits as an atomic group if the number of version
+ // edits exceeds 1.
+ if (cfds_to_commit.size() > 1) {
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+ status =
+ versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+ edit_lists, &mutex_, directories_.GetDbDir());
+ }
+
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+ *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+ if (0 == i && num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+ }
+#endif // !NDEBUG
+ }
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ if (status.ok()) {
+ for (auto& job : ingestion_jobs) {
+ job.UpdateStats();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ num_running_ingest_file_ -= static_cast<int>(num_cfs);
+ if (0 == num_running_ingest_file_) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+ }
+ // mutex_ is unlocked here
+
+ // Cleanup
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs[i].Clean();
+    // This may roll back jobs that have completed successfully. This is
+    // intended for atomicity.
+ ingestion_jobs[i].Cleanup(status);
+ }
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+ }
+ }
+ }
+ return status;
+}
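+// Usage sketch (illustrative only, not part of the original source): ingesting
+// into two column families as one atomic operation. `cf1`/`cf2` are
+// hypothetical ColumnFamilyHandle* values and the .sst paths are hypothetical.
+//
+//   rocksdb::IngestExternalFileArg arg1, arg2;
+//   arg1.column_family = cf1;
+//   arg1.external_files = {"/tmp/cf1_batch.sst"};
+//   arg2.column_family = cf2;
+//   arg2.external_files = {"/tmp/cf2_batch.sst"};
+//   rocksdb::Status s = db->IngestExternalFiles({arg1, arg2});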
+
+Status DBImpl::CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ assert(*handle == nullptr);
+ std::string cf_comparator_name = options.comparator->Name();
+ if (cf_comparator_name != metadata.db_comparator_name) {
+ return Status::InvalidArgument("Comparator name mismatch");
+ }
+
+ // Create column family.
+ auto status = CreateColumnFamily(options, column_family_name, handle);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Import sst files from metadata.
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(*handle);
+ auto cfd = cfh->cfd();
+ ImportColumnFamilyJob import_job(env_, versions_.get(), cfd,
+ immutable_db_options_, file_options_,
+ import_options, metadata.files);
+
+ SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+ VersionEdit dummy_edit;
+ uint64_t next_file_number = 0;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Don't import files when there is a bg_error
+ status = error_handler_.GetBGError();
+ }
+
+    // Make sure that bg cleanup won't delete the files that we are importing
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse a file number that has already been assigned to the
+      // internal file, which would overwrite the external file. To protect the
+      // external file, we have to make sure the file number is never reused.
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ }
+ }
+ dummy_sv_ctx.Clean();
+
+ if (status.ok()) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ status = import_job.Prepare(next_file_number, sv);
+ CleanupSuperVersion(sv);
+ }
+
+ if (status.ok()) {
+ SuperVersionContext sv_context(true /*create_superversion*/);
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ num_running_ingest_file_++;
+ assert(!cfd->IsDropped());
+ status = import_job.Run();
+
+ // Install job edit [Mutex will be unlocked here]
+ if (status.ok()) {
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+ &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ num_running_ingest_file_--;
+ if (num_running_ingest_file_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ }
+ // mutex_ is unlocked here
+
+ sv_context.Clean();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ }
+
+ import_job.Cleanup(status);
+ if (!status.ok()) {
+ DropColumnFamily(*handle);
+ DestroyColumnFamilyHandle(*handle);
+ *handle = nullptr;
+ }
+ return status;
+}
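+// Usage sketch (illustrative only, not part of the original source): the
+// ExportImportFilesMetaData consumed here is typically produced by exporting a
+// column family from a source DB (e.g. via Checkpoint::ExportColumnFamily(),
+// assumed available in this RocksDB version). Handles and paths below are
+// hypothetical.
+//
+//   rocksdb::Checkpoint* checkpoint = nullptr;
+//   rocksdb::ExportImportFilesMetaData* metadata = nullptr;
+//   rocksdb::Status s = rocksdb::Checkpoint::Create(src_db, &checkpoint);
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
+//   }
+//   rocksdb::ColumnFamilyHandle* new_cf = nullptr;
+//   if (s.ok()) {
+//     s = dst_db->CreateColumnFamilyWithImport(
+//         rocksdb::ColumnFamilyOptions(), "imported_cf",
+//         rocksdb::ImportColumnFamilyOptions(), *metadata, &new_cf);
+//   }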
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+ Status s;
+ std::vector<ColumnFamilyData*> cfd_list;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized()) {
+ cfd->Ref();
+ cfd_list.push_back(cfd);
+ }
+ }
+ }
+ std::vector<SuperVersion*> sv_list;
+ for (auto cfd : cfd_list) {
+ sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+ }
+ for (auto& sv : sv_list) {
+ VersionStorageInfo* vstorage = sv->current->storage_info();
+ ColumnFamilyData* cfd = sv->current->cfd();
+ Options opts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfd->GetLatestCFOptions());
+ }
+ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+ for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+ j++) {
+ const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
+ std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+ fd.GetNumber(), fd.GetPathId());
+ s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_,
+ read_options, fname);
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ bool defer_purge =
+ immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto sv : sv_list) {
+ if (sv && sv->Unref()) {
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ } else {
+ delete sv;
+ }
+ }
+ }
+ if (defer_purge) {
+ SchedulePurge();
+ }
+ for (auto cfd : cfd_list) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
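+// Usage sketch (illustrative only, not part of the original source): verifying
+// the checksums of every SST file in every column family of an open DB.
+//
+//   rocksdb::Status s = db->VerifyChecksum();
+//   if (!s.ok()) {
+//     // Some SST block failed verification; treat the DB as corrupted.
+//   }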
+
+void DBImpl::NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+
+ for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+ ExternalFileIngestionInfo info;
+ info.cf_name = cfd->GetName();
+ info.external_file_path = f.external_file_path;
+ info.internal_file_path = f.internal_file_path;
+ info.global_seqno = f.assigned_seqno;
+ info.table_properties = f.table_properties;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnExternalFileIngested(this, info);
+ }
+ }
+}
+
+void DBImpl::WaitForIngestFile() {
+ mutex_.AssertHeld();
+ while (num_running_ingest_file_ > 0) {
+ bg_cv_.Wait();
+ }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer)));
+ return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ Status s;
+ if (tracer_ != nullptr) {
+ s = tracer_->Close();
+ tracer_.reset();
+ } else {
+ return Status::IOError("No trace file to close");
+ }
+ return s;
+}
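+// Usage sketch (illustrative only, not part of the original source): recording
+// a query trace to a file and closing it, using the NewFileTraceWriter()
+// factory declared in rocksdb/trace_reader_writer.h (assumed available here);
+// the trace path is hypothetical.
+//
+//   std::unique_ptr<rocksdb::TraceWriter> trace_writer;
+//   rocksdb::Status s = rocksdb::NewFileTraceWriter(
+//       db->GetEnv(), rocksdb::EnvOptions(), "/tmp/db.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(rocksdb::TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... run the workload to capture ...
+//   if (s.ok()) s = db->EndTrace();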
+
+Status DBImpl::StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ return block_cache_tracer_.StartTrace(env_, trace_options,
+ std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+ block_cache_tracer_.EndTrace();
+ return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeek(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id,
+ const Slice& key) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeekForPrev(cf_id, key);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number) {
+ Status s;
+ SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+ assert(nullptr != next_file_number);
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Do not ingest files when there is a bg_error
+ return error_handler_.GetBGError();
+ }
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ VersionEdit dummy_edit;
+  // If a crash happens after a hard link is established, the Recover function
+  // may reuse a file number that has already been assigned to the internal
+  // file, which would overwrite the external file. To protect the external
+  // file, we have to make sure the file number is never reused.
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ dummy_sv_ctx.Clean();
+ return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ if (mutable_db_options_.max_open_files == -1) {
+ uint64_t oldest_time = port::kMaxUint64;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped()) {
+ uint64_t ctime;
+ {
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ Version* version = sv->current;
+ version->GetCreationTimeOfOldestFile(&ctime);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ if (ctime < oldest_time) {
+ oldest_time = ctime;
+ }
+ if (oldest_time == 0) {
+ break;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("This API only works if max_open_files = -1");
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..119555cb4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2107 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_job.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than the main one.
+class Directories {
+ public:
+ Status SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths);
+
+ Directory* GetDataDir(size_t path_id) const {
+ assert(path_id < data_dirs_.size());
+ Directory* ret_dir = data_dirs_[path_id].get();
+ if (ret_dir == nullptr) {
+ // Should use db_dir_
+ return db_dir_.get();
+ }
+ return ret_dir;
+ }
+
+ Directory* GetWalDir() {
+ if (wal_dir_) {
+ return wal_dir_.get();
+ }
+ return db_dir_.get();
+ }
+
+ Directory* GetDbDir() { return db_dir_.get(); }
+
+ private:
+ std::unique_ptr<Directory> db_dir_;
+ std::vector<std::unique_ptr<Directory>> data_dirs_;
+ std::unique_ptr<Directory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual
+// class implementing it. It is the entry point of the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of its functions are
+// divided among several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+ DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch = false, const bool batch_per_txn = true);
+ // No copying allowed
+ DBImpl(const DBImpl&) = delete;
+ void operator=(const DBImpl&) = delete;
+
+ virtual ~DBImpl();
+
+ // ---- Implementations of the DB interface ----
+
+ using DB::Resume;
+ virtual Status Resume() override;
+
+ using DB::Put;
+ virtual Status Put(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Merge;
+ virtual Status Merge(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) override;
+ using DB::Delete;
+ virtual Status Delete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ using DB::Write;
+ virtual Status Write(const WriteOptions& options,
+ WriteBatch* updates) override;
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ using DB::GetMergeOperands;
+ Status GetMergeOperands(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.merge_operands = merge_operands;
+ get_impl_options.get_merge_operands_options = get_merge_operands_options;
+ get_impl_options.number_of_operands = number_of_operands;
+ get_impl_options.get_value = false;
+ return GetImpl(options, key, get_impl_options);
+ }
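+  // Usage sketch (illustrative only, not part of the original source):
+  // reading all merge operands for a key without merging them, assuming the
+  // column family uses a merge operator and `db` is an open rocksdb::DB*.
+  //
+  //   std::vector<rocksdb::PinnableSlice> operands(10);
+  //   rocksdb::GetMergeOperandsOptions opts;
+  //   opts.expected_max_number_of_operands = 10;
+  //   int count = 0;
+  //   rocksdb::Status s = db->GetMergeOperands(
+  //       rocksdb::ReadOptions(), db->DefaultColumnFamily(), "counter_key",
+  //       operands.data(), &opts, &count);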
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with a number of elements
+  // equal to keys.size(). This allows the storage for those to be allocated
+  // by the caller on the stack for small batches.
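+  // Usage sketch (illustrative only, not part of the original source) of the
+  // batched MultiGet with caller-provided arrays, assuming `db` is an open
+  // rocksdb::DB* and the keys are hypothetical:
+  //
+  //   std::vector<rocksdb::Slice> keys{"k1", "k2", "k3"};
+  //   std::vector<rocksdb::PinnableSlice> values(keys.size());
+  //   std::vector<rocksdb::Status> statuses(keys.size());
+  //   db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
+  //                keys.size(), keys.data(), values.data(), statuses.data());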
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGetWithCallback(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) override;
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if the key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if the value was found, then value_found will be set to
+  // true, otherwise false.
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value,
+ bool* value_found = nullptr) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override;
+ using DB::GetMapProperty;
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override;
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override;
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) override;
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) override;
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override;
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ virtual Status PauseBackgroundWork() override;
+ virtual Status ContinueBackgroundWork() override;
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+ virtual void EnableManualCompaction() override;
+ virtual void DisableManualCompaction() override;
+
+ using DB::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override;
+ virtual const std::string& GetName() const override;
+ virtual Env* GetEnv() const override;
+ virtual FileSystem* GetFileSystem() const override;
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override;
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+ virtual Status FlushWAL(bool sync) override;
+ bool TEST_WALBufferIsEmpty(bool lock = true);
+ virtual Status SyncWAL() override;
+ virtual Status LockWAL() override;
+ virtual Status UnlockWAL() override;
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
+
+ virtual Status GetDbIdentity(std::string& identity) const override;
+
+ virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+ ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+ virtual Status Close() override;
+
+ Status GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+ using DB::ResetStats;
+ virtual Status ResetStats() override;
+ virtual Status DisableFileDeletions() override;
+ virtual Status EnableFileDeletions(bool force) override;
+ virtual int IsFileDeletionsEnabled() const;
+ // All the returned filenames start with "/"
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override;
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override;
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) override;
+ virtual Status DeleteFile(std::string name) override;
+ Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override;
+
+  // Obtains the metadata of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family matching the specified name.
+ // TODO(yhchiang): output parameter is placed in the end in this codebase.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* metadata) override;
+
+ Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override;
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) override;
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override;
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override;
+
+ using DB::VerifyChecksum;
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+
+ using DB::StartTrace;
+ virtual Status StartTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndTrace;
+ virtual Status EndTrace() override;
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override;
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override;
+
+#endif // ROCKSDB_LITE
+
+ // ---- End of implementations of the DB interface ----
+
+ struct GetImplOptions {
+ ColumnFamilyHandle* column_family = nullptr;
+ PinnableSlice* value = nullptr;
+ bool* value_found = nullptr;
+ ReadCallback* callback = nullptr;
+ bool* is_blob_index = nullptr;
+    // If true, return the value associated with key via the value pointer;
+    // else return all merge operands for key via the merge_operands pointer.
+ bool get_value = true;
+ // Pointer to an array of size
+ // get_merge_operands_options.expected_max_number_of_operands allocated by
+ // user
+ PinnableSlice* merge_operands = nullptr;
+ GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+ int* number_of_operands = nullptr;
+ };
+
+ // Function that Get and KeyMayExist call with no_io true or false
+ // Note: 'value_found' from KeyMayExist propagates here
+ // This function is also called by GetMergeOperands
+  // If get_impl_options.get_value = true, get the value associated with
+  // get_impl_options.key via get_impl_options.value.
+  // If get_impl_options.get_value = false, get the merge operands associated
+  // with get_impl_options.key via get_impl_options.merge_operands.
+ Status GetImpl(const ReadOptions& options, const Slice& key,
+ GetImplOptions get_impl_options);
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool allow_blob = false,
+ bool allow_refresh = true);
+
+ virtual SequenceNumber GetLastPublishedSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastPublishedSequence();
+ }
+ }
+
+ // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+ // the second write queue otherwise.
+ virtual void SetLastPublishedSequence(SequenceNumber seq);
+ // Returns LastSequence in last_seq_same_as_publish_seq_
+  // mode and LastAllocatedSequence otherwise. This is useful when visibility
+ // depends also on data written to the WAL but not to the memtable.
+ SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+ // Similar to Write() but will call the callback once on the single write
+ // thread to determine whether it is safe to perform the write.
+ virtual Status WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback);
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into the current
+  // memtables. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ //
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history);
+
+ // For a given key, check to see if there are any records for this key
+ // in the memtables, including memtable history. If cache_only is false,
+ // SST files will also be checked.
+ //
+ // If a key is found, *found_record_for_key will be set to true and
+ // *seq will be set to the stored sequence number for the latest
+ // operation on this key or kMaxSequenceNumber if unknown.
+ // If no key is found, *found_record_for_key will be set to false.
+ //
+ // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+ // the sequence number has been cleared from the record. If the caller is
+ // holding an active db snapshot, we know the missing sequence must be less
+ // than the snapshot's sequence number (sequence numbers are only cleared
+ // when there are no earlier active snapshots).
+ //
+ // If NotFound is returned and found_record_for_key is set to false, then no
+ // record for this key was found. If the caller is holding an active db
+  // snapshot, we know that no key could have existed after this snapshot
+ // (since we do not compact keys that have an earlier snapshot).
+ //
+ // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if they only contain data
+ // older than `lower_bound_seq`.
+ //
+ // Returns OK or NotFound on success,
+ // other status on unexpected error.
+  // TODO(andrewkr): this API needs to be aware of range deletion operations
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq,
+ bool* found_record_for_key,
+ bool* is_blob_index = nullptr);
+
+ Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key);
+ Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key);
+#endif // ROCKSDB_LITE
+
+ // Similar to GetSnapshot(), but also lets the db know that this snapshot
+ // will be used for transaction write-conflict checking. The DB can then
+ // make sure not to compact any keys that would prevent a write-conflict from
+ // being detected.
+ const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+  // Checks that all live files exist on the file system and that their file
+  // sizes match our in-memory records.
+ virtual Status CheckConsistency();
+
+ // max_file_num_to_ignore allows bottom level compaction to filter out newly
+ // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+ // disable the filtering
+ Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const Slice* begin, const Slice* end,
+ bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore);
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ InternalIterator* NewInternalIterator(
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family = nullptr);
+
+ LogsWithPrepTracker* logs_with_prep_tracker() {
+ return &logs_with_prep_tracker_;
+ }
+
+ struct BGJobLimits {
+ int max_flushes;
+ int max_compactions;
+ };
+ // Returns maximum background flushes and compactions allowed to be scheduled
+ BGJobLimits GetBGJobLimits() const;
+ // Need a static version that can be called during SanitizeOptions().
+ static BGJobLimits GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions);
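+  //
+  // Sketch of the intended derivation (illustrative only; the authoritative
+  // logic lives in db_impl.cc):
+  //
+  //   BGJobLimits res;
+  //   if (max_background_flushes == -1 && max_background_compactions == -1) {
+  //     // Derive both limits from max_background_jobs: roughly a quarter of
+  //     // the threads go to flushes, the rest to compactions.
+  //     res.max_flushes = std::max(1, max_background_jobs / 4);
+  //     res.max_compactions =
+  //         std::max(1, max_background_jobs - res.max_flushes);
+  //   } else {
+  //     // Legacy per-kind options take precedence when explicitly set.
+  //     res.max_flushes = std::max(1, max_background_flushes);
+  //     res.max_compactions = std::max(1, max_background_compactions);
+  //   }
+  //   if (!parallelize_compactions) {
+  //     res.max_compactions = 1;  // e.g. while the DB is still opening
+  //   }
+  //   return res;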
+
+ // move logs pending closing from job_context to the DB queue and
+ // schedule a purge
+ void ScheduleBgLogWriterClose(JobContext* job_context);
+
+ uint64_t MinLogNumberToKeep();
+
+ // Returns the lower bound file number for SSTs that won't be deleted, even if
+ // they're obsolete. This lower bound is used internally to prevent newly
+ // created flush/compaction output files from being deleted before they're
+ // installed. This technique avoids the need for tracking the exact numbers of
+ // files pending creation, although it prevents more files than necessary from
+ // being deleted.
+ uint64_t MinObsoleteSstNumberToKeep();
+
+ // Returns the list of live files in 'live' and the list
+ // of all files in the filesystem in 'candidate_files'.
+ // If force == false and the last call was less than
+ // db_options_.delete_obsolete_files_period_micros microseconds ago,
+ // it will not fill up the job_context
+ void FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan = false);
+
+  // Diffs the candidate files against the live files; those that do not
+  // belong to live files are possibly removed. Also, removes all the
+  // files in sst_delete_files and log_delete_files.
+ // It is not necessary to hold the mutex when invoking this method.
+ // If FindObsoleteFiles() was run, we need to also run
+ // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  void PurgeObsoleteFiles(JobContext& background_context,
+ bool schedule_only = false);
+
+ // Schedule a background job to actually delete obsolete files.
+ void SchedulePurge();
+
+ const SnapshotList& snapshots() const { return snapshots_; }
+
+  // Load the list of snapshots that are no newer than `max_seq` into
+  // `snap_vector`, in ascending order.
+ // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+ // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+ void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot,
+ const SequenceNumber& max_seq) const {
+ InstrumentedMutexLock l(mutex());
+ snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+ }
+
+ const ImmutableDBOptions& immutable_db_options() const {
+ return immutable_db_options_;
+ }
+
+ // Cancel all background jobs, including flush, compaction, background
+ // purging, stats dumping threads, etc. If `wait` = true, wait for the
+ // running jobs to abort or finish before returning. Otherwise, only
+ // sends the signals.
+ void CancelAllBackgroundWork(bool wait);
+
+ // Find Super version and reference it. Based on options, it might return
+ // the thread local cached one.
+ // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held.
+ SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+ // Un-reference the super version and clean it up if it is the last reference.
+ void CleanupSuperVersion(SuperVersion* sv);
+
+  // Un-reference the super version and return it to thread local cache if
+  // needed. If it is the last reference of the super version, clean it up
+  // after un-referencing it.
+ void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread.
+  void ReturnAndCleanupSuperVersion(uint32_t column_family_id, SuperVersion* sv);
+
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held. Return value only valid until next call to this function or
+ // mutex is released.
+ ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+  // Same as above, but should be called without mutex held and not on the
+  // write thread.
+ std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id);
+
+ // Returns the number of currently running flushes.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_flushes() {
+ mutex_.AssertHeld();
+ return num_running_flushes_;
+ }
+
+ // Returns the number of currently running compactions.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_compactions() {
+ mutex_.AssertHeld();
+ return num_running_compactions_;
+ }
+
+ const WriteController& write_controller() { return write_controller_; }
+
+ InternalIterator* NewInternalIterator(
+ const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
+ Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence);
+
+  // Hollow transaction shells used for recovery.
+  // These will then be passed to TransactionDB so that
+  // locks can be reacquired before writing can resume.
+ struct RecoveredTransaction {
+ std::string name_;
+ bool unprepared_;
+
+ struct BatchInfo {
+ uint64_t log_number_;
+      // TODO(lth): The memory usage here can be big for unprepared
+      // transactions. This is only useful for rollbacks, and we can in
+      // theory just keep the keyset for that.
+ WriteBatch* batch_;
+      // Number of sub-batches. A new sub-batch is created if the txn attempts
+      // to insert a duplicate (key, seq) into the memtable. This is currently
+      // used in WritePreparedTxn/WriteUnpreparedTxn.
+ size_t batch_cnt_;
+ };
+
+ // This maps the seq of the first key in the batch to BatchInfo, which
+ // contains WriteBatch and other information relevant to the batch.
+ //
+ // For WriteUnprepared, batches_ can have size greater than 1, but for
+ // other write policies, it must be of size 1.
+ std::map<SequenceNumber, BatchInfo> batches_;
+
+ explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared)
+ : name_(name), unprepared_(unprepared) {
+ batches_[seq] = {log, batch, batch_cnt};
+ }
+
+ ~RecoveredTransaction() {
+ for (auto& it : batches_) {
+ delete it.second.batch_;
+ }
+ }
+
+ void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+ size_t batch_cnt, bool unprepared) {
+ assert(batches_.count(seq) == 0);
+ batches_[seq] = {log_number, batch, batch_cnt};
+ // Prior state must be unprepared, since the prepare batch must be the
+ // last batch.
+ assert(unprepared_);
+ unprepared_ = unprepared;
+ }
+ };
+
+ bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions() {
+ return recovered_transactions_;
+ }
+
+ RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ if (it == recovered_transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+ }
+
+ void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared_batch) {
+ // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+ // times for every unprepared batch encountered during recovery.
+ //
+ // If the transaction is prepared, then the last call to
+ // InsertRecoveredTransaction will have unprepared_batch = false.
+ auto rtxn = recovered_transactions_.find(name);
+ if (rtxn == recovered_transactions_.end()) {
+ recovered_transactions_[name] = new RecoveredTransaction(
+ log, name, batch, seq, batch_cnt, unprepared_batch);
+ } else {
+ rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+ }
+ logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+ }
+
+ void DeleteRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ assert(it != recovered_transactions_.end());
+ auto* trx = it->second;
+ recovered_transactions_.erase(it);
+ for (const auto& info : trx->batches_) {
+ logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+ info.second.log_number_);
+ }
+ delete trx;
+ }
+
+ void DeleteAllRecoveredTransactions() {
+ for (auto it = recovered_transactions_.begin();
+ it != recovered_transactions_.end(); ++it) {
+ delete it->second;
+ }
+ recovered_transactions_.clear();
+ }
+
+ void AddToLogsToFreeQueue(log::Writer* log_writer) {
+ logs_to_free_queue_.push_back(log_writer);
+ }
+
+ void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+ superversions_to_free_queue_.push_back(sv);
+ }
+
+ void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+ // Fill JobContext with snapshot information needed by flush and compaction.
+ void GetSnapshotContext(JobContext* job_context,
+ std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker);
+
+ // Not thread-safe.
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+ InstrumentedMutex* mutex() const { return &mutex_; }
+
+ // Initialize a brand new DB. The DB directory is expected to be empty before
+ // calling it.
+ Status NewDB();
+
+ // This is to be used only by internal rocksdb classes.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn);
+
+ static Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory);
+
+  // Find the stats map from stats_history_ with the smallest timestamp in
+  // the range [start_time, end_time).
+ bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map);
+
+  // Print information about all tombstones of all iterators to the
+  // std::string. This is only used by ldb. The output might be capped.
+  // Tombstones printed out are not guaranteed to be in any order.
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str);
+
+#ifndef NDEBUG
+ // Compact any files in the named level that overlap [*begin, *end]
+ Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool disallow_trivial_move = false);
+
+ void TEST_SwitchWAL();
+
+ bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+ bool TEST_IsLogGettingFlushed() {
+ return alive_log_files_.begin()->getting_flushed;
+ }
+
+ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+ // Force current memtable contents to be flushed.
+ Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+ ColumnFamilyHandle* cfh = nullptr);
+
+ Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts);
+
+ // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+ // is because in certain cases, we can flush column families, wait for the
+ // flush to complete, but delete the column family handle before the wait
+ // finishes. For example in CompactRange.
+ Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+ const FlushOptions& flush_opts);
+
+  // Wait for memtable flush
+ Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+ // Wait for any compaction
+  // We add a bool parameter to wait for unscheduled_compactions_ == 0, but
+  // this is only for the special test of CancelledCompactions
+ Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family = nullptr);
+
+ // Return the current manifest file no.
+ uint64_t TEST_Current_Manifest_FileNo();
+
+ // Returns the number that'll be assigned to the next file that's created.
+ uint64_t TEST_Current_Next_FileNo();
+
+ // get total level0 file size. Only for testing.
+ uint64_t TEST_GetLevel0TotalSize();
+
+ void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata);
+
+ void TEST_LockMutex();
+
+ void TEST_UnlockMutex();
+
+ // REQUIRES: mutex locked
+ void* TEST_BeginWrite();
+
+ // REQUIRES: mutex locked
+ // pass the pointer that you got from TEST_BeginWrite()
+ void TEST_EndWrite(void* w);
+
+ uint64_t TEST_MaxTotalInMemoryState() const {
+ return max_total_in_memory_state_;
+ }
+
+ size_t TEST_LogsToFreeSize();
+
+ uint64_t TEST_LogfileNumber();
+
+ uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+ // Returns column family name to ImmutableCFOptions map.
+ Status TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+  // Return the latest MutableCFOptions of a column family
+ Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+ MutableCFOptions* mutable_cf_options);
+
+ Cache* TEST_table_cache() { return table_cache_.get(); }
+
+ WriteController& TEST_write_controler() { return write_controller_; }
+
+ uint64_t TEST_FindMinLogContainingOutstandingPrep();
+ uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+ size_t TEST_PreparedSectionCompletedSize();
+ size_t TEST_LogsWithPrepSize();
+
+ int TEST_BGCompactionsAllowed() const;
+ int TEST_BGFlushesAllowed() const;
+ size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ void TEST_WaitForDumpStatsRun(std::function<void()> callback) const;
+ void TEST_WaitForPersistStatsRun(std::function<void()> callback) const;
+ bool TEST_IsPersistentStatsEnabled() const;
+ size_t TEST_EstimateInMemoryStatsHistorySize() const;
+#endif // NDEBUG
+
+ protected:
+ const std::string dbname_;
+ std::string db_id_;
+ std::unique_ptr<VersionSet> versions_;
+ // Flag to check whether we allocated and own the info log file
+ bool own_info_log_;
+ const DBOptions initial_db_options_;
+ Env* const env_;
+ std::shared_ptr<FileSystem> fs_;
+ const ImmutableDBOptions immutable_db_options_;
+ MutableDBOptions mutable_db_options_;
+ Statistics* stats_;
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions_;
+ std::unique_ptr<Tracer> tracer_;
+ InstrumentedMutex trace_mutex_;
+ BlockCacheTracer block_cache_tracer_;
+
+ // State below is protected by mutex_
+  // With two_write_queues enabled, some of the variables that are accessed
+  // during WriteToWAL need different synchronization: log_empty_,
+  // alive_log_files_, logs_, logfile_number_. Refer to the definition of each
+  // variable below for more description.
+ mutable InstrumentedMutex mutex_;
+
+ ColumnFamilyHandleImpl* default_cf_handle_;
+ InternalStats* default_cf_internal_stats_;
+
+  // Only used for dynamically adjusting max_total_wal_size. It is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families.
+ uint64_t max_total_in_memory_state_;
+ // If true, we have only one (default) column family. We use this to optimize
+ // some code-paths
+ bool single_column_family_mode_;
+
+ // The options to access storage files
+ const FileOptions file_options_;
+
+  // Additional options for compaction and flush
+ FileOptions file_options_for_compaction_;
+
+ std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+ // Increase the sequence number after writing each batch, whether memtable is
+ // disabled for that or not. Otherwise the sequence number is increased after
+ // writing each key into memtable. This implies that when disable_memtable is
+ // set, the seq is not increased at all.
+ //
+ // Default: false
+ const bool seq_per_batch_;
+ // This determines during recovery whether we expect one writebatch per
+ // recovered transaction, or potentially multiple writebatches per
+ // transaction. For WriteUnprepared, this is set to false, since multiple
+ // batches can exist per transaction.
+ //
+ // Default: true
+ const bool batch_per_txn_;
+
+  // Persist options to the options file.
+  // If need_mutex_lock = true, the method will lock the DB mutex; otherwise,
+  // except in DB::Open(), the mutex must already be held.
+  // If need_enter_write_thread = true, the method will enter the write
+  // thread; otherwise, except in DB::Open(), the caller must already be in it.
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+#endif // !ROCKSDB_LITE
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
+
+ // If disable_memtable is set the application logic must guarantee that the
+  // batch will still be skipped from memtable during the recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+  // In WriteCommitted it is guaranteed since disable_memtable is used for
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+  // indicates the number of sub-batches. A sub-batch is a subset of the write
+  // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr);
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+  // Whether the batch requires an order to be assigned to it
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+  // if required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+  // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub_batch. Otherwise each duplicate key
+ // marks start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+  // Write cached_recoverable_state_ to the memtable if it is not empty.
+  // The writer must be the leader in write_thread_ and hold mutex_.
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_log_file_exist = false,
+ bool error_if_data_exists_in_logs = false,
+ uint64_t* recovered_seq = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
+
+ private:
+ friend class DB;
+ friend class ErrorHandler;
+ friend class InternalStats;
+ friend class PessimisticTransaction;
+ friend class TransactionBaseImpl;
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDB;
+ friend class WriteBatchWithIndex;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+ friend class ForwardIterator;
+#endif
+ friend struct SuperVersion;
+ friend class CompactedDBImpl;
+ friend class DBTest_ConcurrentFlushWAL_Test;
+ friend class DBTest_MixedSlowdownOptionsStop_Test;
+ friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+ friend class DBCompactionTest_CompactionDuringShutdown_Test;
+ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+ friend class DBTest2_ReadCallbackTest_Test;
+ friend class WriteCallbackTest_WriteWithCallbackTest_Test;
+ friend class XFTransactionWriteHandler;
+ friend class DBBlobIndexTest;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+ struct CompactionState;
+ struct PrepickedCompaction;
+ struct PurgeFileInfo;
+
+ struct WriteContext {
+ SuperVersionContext superversion_context;
+ autovector<MemTable*> memtables_to_free_;
+
+ explicit WriteContext(bool create_superversion = false)
+ : superversion_context(create_superversion) {}
+
+ ~WriteContext() {
+ superversion_context.Clean();
+ for (auto& m : memtables_to_free_) {
+ delete m;
+ }
+ }
+ };
+
+ struct LogFileNumberSize {
+ explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+ void AddSize(uint64_t new_size) { size += new_size; }
+ uint64_t number;
+ uint64_t size = 0;
+ bool getting_flushed = false;
+ };
+
+ struct LogWriterNumber {
+ // pass ownership of _writer
+ LogWriterNumber(uint64_t _number, log::Writer* _writer)
+ : number(_number), writer(_writer) {}
+
+ log::Writer* ReleaseWriter() {
+ auto* w = writer;
+ writer = nullptr;
+ return w;
+ }
+ Status ClearWriter() {
+ Status s = writer->WriteBuffer();
+ delete writer;
+ writer = nullptr;
+ return s;
+ }
+
+ uint64_t number;
+    // Visual Studio doesn't support noncopyable deque elements, which rules
+    // out holding the writer in a std::unique_ptr member.
+ log::Writer* writer; // own
+ // true for some prefix of logs_
+ bool getting_synced = false;
+ };
+
+ // PurgeFileInfo is a structure to hold information of files to be deleted in
+ // purge_files_
+ struct PurgeFileInfo {
+ std::string fname;
+ std::string dir_to_sync;
+ FileType type;
+ uint64_t number;
+ int job_id;
+ PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+ int jid)
+ : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+ };
+
+ // Argument required by background flush thread.
+ struct BGFlushArg {
+ BGFlushArg()
+ : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+ BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+ SuperVersionContext* superversion_context)
+ : cfd_(cfd),
+ max_memtable_id_(max_memtable_id),
+ superversion_context_(superversion_context) {}
+
+ // Column family to flush.
+ ColumnFamilyData* cfd_;
+ // Maximum ID of memtable to flush. In this column family, memtables with
+ // IDs smaller than this value must be flushed before this flush completes.
+ uint64_t max_memtable_id_;
+ // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+ // installs a new superversion for the column family. This operation
+ // requires a SuperVersionContext object (currently embedded in JobContext).
+ SuperVersionContext* superversion_context_;
+ };
+
+ // Argument passed to flush thread.
+ struct FlushThreadArg {
+ DBImpl* db_;
+
+ Env::Priority thread_pri_;
+ };
+
+ // Information for a manual compaction
+ struct ManualCompactionState {
+ ColumnFamilyData* cfd;
+ int input_level;
+ int output_level;
+ uint32_t output_path_id;
+ Status status;
+ bool done;
+ bool in_progress; // compaction request being processed?
+ bool incomplete; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin; // nullptr means beginning of key range
+ const InternalKey* end; // nullptr means end of key range
+ InternalKey* manual_end; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+  // The persistent stats column family has two format version keys which are
+  // used for compatibility checks. Write the format version if it's created
+  // for the first time; read the format version and check compatibility if
+  // recovering from disk. This function requires DB mutex held at entrance
+  // but may release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl();
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
+
+ // Background process needs to call
+ // auto x = CaptureCurrentFileNumberInPendingOutputs()
+ // auto file_num = versions_->NewFileNumber();
+ // <do something>
+ // ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something> is running.
+ // -----------
+ // This function will capture current file number and append it to
+  // pending_outputs_. This will prevent any background process from deleting
+  // any file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and blocked by any other pending_outputs_ calls)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
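+  //
+  // Typical shape of the protect/release pattern described above (sketch
+  // only; local variable names are illustrative):
+  //
+  //   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_elem(
+  //       new std::list<uint64_t>::iterator(
+  //           CaptureCurrentFileNumberInPendingOutputs()));
+  //   uint64_t file_num = versions_->NewFileNumber();
+  //   // ... create the output file numbered `file_num` ...
+  //   ReleaseFileNumberFromPendingOutputs(pending_outputs_elem);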
+
+ Status SyncClosedLogs(JobContext* job_context);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used at database recovery time (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // flushing memtables to storage concurrently.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason, bool writes_stopped = false);
+
+ Status AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& options, FlushReason flush_reason,
+ bool writes_stopped = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+  // Wait for a memtable to be flushed.
+  // If flush_memtable_id is non-null, wait until the memtable with that ID
+  // gets flushed. Otherwise, wait until the column family doesn't have any
+  // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+    // In case pipelined write is enabled, wait for all pending memtable
+    // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferFull(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
+ WriteContext* write_context);
+
+ WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state);
+
+ Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size);
+
+ Status WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence);
+
+ Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used, SequenceNumber* last_sequence,
+ size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ // Wait for current IngestExternalFile() calls to finish.
+ // REQUIRES: mutex_ held
+ void WaitForIngestFile();
+
+#else
+  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+  // will be a no-op
+ void WaitForIngestFile() {}
+#endif // ROCKSDB_LITE
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+  // A flush request specifies the column families to flush as well as the
+  // largest memtable id to persist for each column family. Once all the
+  // memtables whose IDs are smaller than or equal to this per-column-family
+  // specified value have been flushed, this flush request is considered to
+  // have completed its work of flushing this column family. After completing
+  // the work for all column families in this request, this flush is
+  // considered complete.
+ typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest;
+
+ void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req);
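+  //
+  // For example, a request to flush everything currently sitting in the
+  // immutable memtable lists of two column families might look like this
+  // (sketch only; cfd_a/cfd_b and the GetLatestMemTableID() call are
+  // illustrative):
+  //
+  //   FlushRequest req = {{cfd_a, cfd_a->imm()->GetLatestMemTableID()},
+  //                       {cfd_b, cfd_b->imm()->GetLatestMemTableID()}};
+  //   SchedulePendingFlush(req, FlushReason::kManualFlush);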
+
+ void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+ void SchedulePendingCompaction(ColumnFamilyData* cfd);
+ void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id);
+ static void BGWorkCompaction(void* arg);
+ // Runs a pre-chosen universal compaction involving bottom level in a
+ // separate, bottom-pri thread pool.
+ static void BGWorkBottomCompaction(void* arg);
+ static void BGWorkFlush(void* arg);
+ static void BGWorkPurge(void* arg);
+ static void UnscheduleCompactionCallback(void* arg);
+ static void UnscheduleFlushCallback(void* arg);
+ void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ void BackgroundCallFlush(Env::Priority thread_pri);
+ void BackgroundCallPurge();
+ Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri);
+
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+ // Request compaction tasks token from compaction thread limiter.
+  // It always succeeds if force = true or the limiter is disabled.
+ bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer);
+
+ // Schedule background tasks
+ void StartTimedTasks();
+
+ void PrintStatistics();
+
+ size_t EstimateInMemoryStatsHistorySize() const;
+
+ // persist stats to column family "_persistent_stats"
+ void PersistStats();
+
+ // dump rocksdb.stats to LOG
+ void DumpStats();
+
+ // Return the minimum empty level that could hold the total data in the
+ // input level. Return the input level, if such level could not be found.
+ int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ // Move the files in the input level to the target level.
+ // If target_level < 0, automatically calculate the minimum level that could
+ // hold the data set.
+ Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+ // helper functions for adding and removing from flush & compaction queues
+ void AddToCompactionQueue(ColumnFamilyData* cfd);
+ ColumnFamilyData* PopFirstFromCompactionQueue();
+ FlushRequest PopFirstFromFlushQueue();
+
+ // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+ // helper function to call after some of the logs_ were synced
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
+
+ SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock = true);
+
+ uint64_t GetMaxTotalWalSize() const;
+
+ Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+ Status CloseHelper();
+
+ void WaitForBackgroundWork();
+
+ // Background threads call this function, which is just a wrapper around
+ // the InstallSuperVersion() function. Background threads carry
+ // sv_context which can have new_superversion already
+ // allocated.
+ // All ColumnFamily state changes go through this function. Here we analyze
+ // the new state and we schedule background work if we detect that the new
+ // state needs flush or compaction.
+ void InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+
+ bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value);
+ bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+ bool HasPendingManualCompaction();
+ bool HasExclusiveManualCompaction();
+ void AddManualCompaction(ManualCompactionState* m);
+ void RemoveManualCompaction(ManualCompactionState* m);
+ bool ShouldntRunManualCompaction(ManualCompactionState* m);
+ bool HaveManualCompaction(ColumnFamilyData* cfd);
+ bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+ void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& compaction_job_stats,
+ const int job_id, const Version* current,
+ CompactionJobInfo* compaction_job_info) const;
+ // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+ // and return the current file_number in 'next_file_number'.
+ // Write a version edit to the MANIFEST.
+ Status ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number);
+#endif //! ROCKSDB_LITE
+
+ bool ShouldPurge(uint64_t file_number) const;
+ void MarkAsGrabbedForPurge(uint64_t file_number);
+
+ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+ Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log);
+
+ // Validate self-consistency of DB options
+ static Status ValidateOptions(const DBOptions& db_options);
+ // Validate self-consistency of DB options and its consistency with cf options
+ static Status ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+ // Utility function to do some debug validation and sort the given vector
+ // of MultiGet keys
+ void PrepareMultiGetKeys(
+ const size_t num_keys, bool sorted,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+ // A structure to hold the information required to process MultiGet of keys
+ // belonging to one column family. For a multi column family MultiGet, there
+ // will be a container of these objects.
+ struct MultiGetColumnFamilyData {
+ ColumnFamilyHandle* cf;
+ ColumnFamilyData* cfd;
+
+ // For the batched MultiGet which relies on sorted keys, start specifies
+ // the index of first key belonging to this column family in the sorted
+ // list.
+ size_t start;
+
+ // For the batched MultiGet case, num_keys specifies the number of keys
+ // belonging to this column family in the sorted list
+ size_t num_keys;
+
+ // SuperVersion for the column family obtained in a manner that ensures a
+ // consistent view across all column families in the DB
+ SuperVersion* super_version;
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+ SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(0),
+ num_keys(0),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+ size_t count, SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(first),
+ num_keys(count),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData() = default;
+ };
+
+ // A common function to obtain a consistent snapshot, which can be implicit
+ // if the user doesn't specify a snapshot in read_options, across
+ // multiple column families for MultiGet. It will attempt to get an implicit
+  // snapshot without acquiring the db_mutex, but will give up after a few
+ // tries and acquire the mutex if a memtable flush happens. The template
+ // allows both the batched and non-batched MultiGet to call this with
+ // either an std::unordered_map or autovector of column families.
+ //
+ // If callback is non-null, the callback is refreshed with the snapshot
+ // sequence number
+ //
+ // A return value of true indicates that the SuperVersions were obtained
+ // from the ColumnFamilyData, whereas false indicates they are thread
+ // local
+ template <class T>
+ bool MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot);
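+  //
+  // Rough call shape (sketch only; the container type, alias, and lambda are
+  // illustrative and depend on the caller):
+  //
+  //   using CFDataVec = autovector<MultiGetColumnFamilyData,
+  //                                MultiGetContext::MAX_BATCH_SIZE>;
+  //   CFDataVec multiget_cf_data;
+  //   // ... populate multiget_cf_data ...
+  //   std::function<MultiGetColumnFamilyData*(CFDataVec::iterator&)>
+  //       iter_deref = [](CFDataVec::iterator& it) { return &(*it); };
+  //   SequenceNumber snapshot;
+  //   bool svs_from_cfd = MultiCFSnapshot(read_options, callback, iter_deref,
+  //                                       &multiget_cf_data, &snapshot);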
+
+ // The actual implementation of the batching MultiGet. The caller is expected
+ // to have acquired the SuperVersion and pass in a snapshot sequence number
+ // in order to construct the LookupKeys. The start_key and num_keys specify
+ // the range of keys in the sorted_keys vector for a single column family.
+ void MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback,
+ bool* is_blob_index);
+
+ // table_cache_ provides its own synchronization
+ std::shared_ptr<Cache> table_cache_;
+
+ // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+ FileLock* db_lock_;
+
+  // In addition to mutex_, stats_history_mutex_ protects writes to
+  // stats_history_
+ InstrumentedMutex stats_history_mutex_;
+  // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+  // logfile_number_. With two_write_queues it also protects alive_log_files_,
+  // and log_empty_. Refer to the definition of each variable below for more
+  // details.
+  // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
+  // mutex_, the order should be first mutex_ and then log_write_mutex_.
+ InstrumentedMutex log_write_mutex_;
+
+ std::atomic<bool> shutting_down_;
+ std::atomic<bool> manual_compaction_paused_;
+ // This condition variable is signaled on these conditions:
+ // * whenever bg_compaction_scheduled_ goes down to 0
+ // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+ // made any progress
+ // * whenever a compaction made any progress
+ // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+ // (i.e. whenever a flush is done, even if it didn't make any progress)
+ // * whenever there is an error in background purge, flush or compaction
+ // * whenever num_running_ingest_file_ goes to 0.
+ // * whenever pending_purge_obsolete_files_ goes to 0.
+ // * whenever disable_delete_obsolete_files_ goes to 0.
+ // * whenever SetOptions successfully updates options.
+ // * whenever a column family is dropped.
+ InstrumentedCondVar bg_cv_;
+ // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+ // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+ // logfile_number_ is currently updated only in write_thread_, it can be read
+ // from the same write_thread_ without any locks.
+ uint64_t logfile_number_;
+ std::deque<uint64_t>
+ log_recycle_files_; // a list of log files that we can recycle
+ bool log_dir_synced_;
+  // Without two_write_queues, reads and writes to log_empty_ are protected by
+  // mutex_. Since it is currently updated/read only in write_thread_, it can
+  // be accessed from the same write_thread_ without any locks. With
+  // two_write_queues writes, where it can be updated in different threads,
+  // reads and writes are protected by log_write_mutex_ instead. This is to
+  // avoid an expensive mutex_ lock during WAL writes, which update log_empty_.
+ bool log_empty_;
+
+ ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+ bool persistent_stats_cfd_exists_ = true;
+
+  // Without two_write_queues, reads and writes to alive_log_files_ are
+  // protected by mutex_. However since back() is never popped, and push_back()
+  // is done only from write_thread_, the same thread can access the item
+  // referred to by back() without mutex_. With two_write_queues_, writes
+ // are protected by locking both mutex_ and log_write_mutex_, and reads must
+ // be under either mutex_ or log_write_mutex_.
+ std::deque<LogFileNumberSize> alive_log_files_;
+ // Log files that aren't fully synced, and the current log file.
+ // Synchronization:
+ // - push_back() is done from write_thread_ with locked mutex_ and
+ // log_write_mutex_
+ // - pop_front() is done from any thread with locked mutex_ and
+ // log_write_mutex_
+ // - reads are done with either locked mutex_ or log_write_mutex_
+ // - back() and items with getting_synced=true are not popped,
+ // - The same thread that sets getting_synced=true will reset it.
+ // - it follows that the object referred by back() can be safely read from
+ // the write_thread_ without using mutex
+ // - it follows that the items with getting_synced=true can be safely read
+ // from the same thread that has set getting_synced=true
+ std::deque<LogWriterNumber> logs_;
+ // Signaled when getting_synced becomes false for some of the logs_.
+ InstrumentedCondVar log_sync_cv_;
+ // This is the app-level state that is written to the WAL but will be used
+ // only during recovery. Using this feature enables not writing the state to
+ // memtable on normal writes and hence improving the throughput. Each new
+ // write of the state will replace the previous state entirely even if the
+  // keys in the two consecutive states do not overlap.
+  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // Otherwise only the head of write_thread_ can access it.
+ WriteBatch cached_recoverable_state_;
+ std::atomic<bool> cached_recoverable_state_empty_ = {true};
+ std::atomic<uint64_t> total_log_size_;
+
+ // If this is non-empty, we need to delete these log files in background
+ // threads. Protected by db mutex.
+ autovector<log::Writer*> logs_to_free_;
+
+ bool is_snapshot_supported_;
+
+ std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+ std::map<std::string, uint64_t> stats_slice_;
+
+ bool stats_slice_initialized_ = false;
+
+ Directories directories_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ WriteThread write_thread_;
+ WriteBatch tmp_batch_;
+ // The write thread when the writers have no memtable write. This will be used
+ // in 2PC to batch the prepares separately from the serial commit.
+ WriteThread nonmem_write_thread_;
+
+ WriteController write_controller_;
+
+ // Size of the last batch group. In slowdown mode, next write needs to
+ // sleep if it uses up the quota.
+ // Note: This is to protect memtable and compaction. If the batch only writes
+  // to the WAL, its size need not be included in this.
+ uint64_t last_batch_group_size_;
+
+ FlushScheduler flush_scheduler_;
+
+ TrimHistoryScheduler trim_history_scheduler_;
+
+ SnapshotList snapshots_;
+
+ // For each background job, pending_outputs_ keeps the current file number at
+ // the time that background job started.
+ // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+ // number bigger than any of the file number in pending_outputs_. Since file
+ // numbers grow monotonically, this also means that pending_outputs_ is always
+ // sorted. After a background job is done executing, its file number is
+ // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+ // it up.
+ // State is protected with db mutex.
+ std::list<uint64_t> pending_outputs_;
+
+ // flush_queue_ and compaction_queue_ hold column families that we need to
+ // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies the
+  // condition cfd->imm()->IsFlushPending()
+  // A column family is inserted into compaction_queue_ when it satisfies the
+  // condition cfd->NeedsCompaction()
+ // Column families in this list are all Ref()-erenced
+ // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+ // do RAII on ColumnFamilyData
+ // Column families are in this queue when they need to be flushed or
+ // compacted. Consumers of these queues are flush and compaction threads. When
+ // column family is put on this queue, we increase unscheduled_flushes_ and
+ // unscheduled_compactions_. When these variables are bigger than zero, that
+ // means we need to schedule background threads for flush and compaction.
+ // Once the background threads are scheduled, we decrease unscheduled_flushes_
+ // and unscheduled_compactions_. That way we keep track of number of
+ // compaction and flush threads we need to schedule. This scheduling is done
+ // in MaybeScheduleFlushOrCompaction()
+ // invariant(column family present in flush_queue_ <==>
+ // ColumnFamilyData::pending_flush_ == true)
+ std::deque<FlushRequest> flush_queue_;
+ // invariant(column family present in compaction_queue_ <==>
+ // ColumnFamilyData::pending_compaction_ == true)
+ std::deque<ColumnFamilyData*> compaction_queue_;
+
+ // A map to store file numbers and filenames of the files to be purged
+ std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+  // A set to store the file numbers that have been assigned to certain
+  // JobContexts. Current implementation tracks ssts only.
+ std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+ // A queue to store log writers to close
+ std::deque<log::Writer*> logs_to_free_queue_;
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+ int unscheduled_flushes_;
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+  // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+  // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+  // Shall we disable deletion of obsolete files?
+  // If 0, deletion is enabled.
+  // If non-zero, files will not be deleted.
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // A thread that wants to switch the memtable can wait on this cv until
+  // pending writes to the memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
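+
+  // Illustrative sketch only (not RocksDB code; names are hypothetical): the
+  // wait pattern these three members support. Writers bump a counter around
+  // their memtable write; the switching thread waits until the counter
+  // drains to zero.
+  //
+  //   std::mutex mu;                    // like switch_mutex_
+  //   std::condition_variable cv;       // like switch_cv_
+  //   std::atomic<size_t> pending{0};   // like pending_memtable_writes_
+  //
+  //   void WriterSide() {
+  //     pending.fetch_add(1);
+  //     /* ... write to the memtable ... */
+  //     if (pending.fetch_sub(1) == 1) {
+  //       std::lock_guard<std::mutex> lk(mu);
+  //       cv.notify_all();
+  //     }
+  //   }
+  //
+  //   void SwitcherSide() {
+  //     std::unique_lock<std::mutex> lk(mu);
+  //     cv.wait(lk, [&] { return pending.load() == 0; });
+  //     /* ... safe to switch the memtable now ... */
+  //   }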
+
+  // Each flush or compaction gets its own job id. This counter makes sure
+  // they're unique.
+ std::atomic<int> next_job_id_;
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+  // Set if an attempt was made to flush all column families that
+  // the oldest log depends on, but uncommitted data in the oldest
+  // log prevents the log from being released.
+  // We must attempt to free the dependent memtables again
+  // at a later time, after the transaction in the oldest
+  // log is fully committed.
+ bool unable_to_release_oldest_log_;
+
+ static const int KEEP_LOG_FILE_NUM = 1000;
+ // MSVC version 1800 still does not have constexpr for ::max()
+ static const uint64_t kNoTimeOut = port::kMaxUint64;
+
+ std::string db_absolute_path_;
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+ WalManager wal_manager_;
+#endif // ROCKSDB_LITE
+
+ // Unified interface for logging events
+ EventLogger event_logger_;
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+  // The min threshold to trigger bottommost compaction for removing
+  // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
+
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+ // handle for scheduling stats dumping at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_dump_stats_;
+
+  // handle for scheduling stats snapshotting at fixed intervals
+ // REQUIRES: mutex locked
+ std::unique_ptr<ROCKSDB_NAMESPACE::RepeatableThread> thread_persist_stats_;
+
+  // When set, we use a separate queue for writes that don't write to the
+  // memtable. In 2PC these are the writes at the Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+  // If true, LastSequence also indicates the last published sequence visible
+  // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+  // It indicates that a customized gc algorithm must be used for
+  // flush/compaction, and if it is not provided via SnapshotChecker, we
+  // should disable gc to be safe.
+ const bool use_custom_gc_;
+  // Flag to indicate that the DB instance shutdown has been initiated. This
+  // is different from the shutting_down_ atomic in that it is set at the
+  // beginning of the shutdown sequence, specifically in order to prevent any
+  // background error recovery from going on in parallel. The latter,
+  // shutting_down_, is set a little later during the shutdown, after
+  // scheduling memtable flushes.
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Clients must periodically call SetPreserveDeletesSequenceNumber()
+ // to advance this seqnum. Default value is 0 which means ALL deletes are
+ // preserved. Note that this has no effect if DBOptions.preserve_deletes
+ // is set to false.
+ std::atomic<SequenceNumber> preserve_deletes_seqnum_;
+ const bool preserve_deletes_;
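+
+  // Illustrative usage sketch only (not taken from this file): a client of a
+  // DB opened with DBOptions.preserve_deletes == true would periodically
+  // advance the cutoff, e.g.
+  //
+  //   db->SetPreserveDeletesSequenceNumber(db->GetLatestSequenceNumber());
+  //
+  // so that deletes older than the latest sequence number become eligible
+  // for garbage collection again.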
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+
+ ErrorHandler error_handler_;
+
+ // Conditional variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
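+
+  // Illustrative sketch only (not RocksDB code; names are hypothetical): the
+  // "install results in ID order" idea this condition variable supports,
+  // reduced to a standalone turn-taking helper.
+  //
+  //   std::mutex mu;
+  //   std::condition_variable cv;
+  //   uint64_t next_id_to_install = 0;
+  //
+  //   void InstallInOrder(uint64_t my_id, const std::function<void()>& body) {
+  //     std::unique_lock<std::mutex> lk(mu);
+  //     cv.wait(lk, [&] { return my_id == next_id_to_install; });
+  //     body();    // e.g. write this flush result to the MANIFEST
+  //     ++next_id_to_install;
+  //     cv.notify_all();
+  //   }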
+
+ bool wal_in_db_path_;
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
+
+extern CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
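+
+// Illustrative usage only (values are made up): ClipToRange() clamps a
+// user-supplied option into a sane range.
+//
+//   size_t write_buffer_size = 1;  // unreasonably small
+//   ClipToRange(&write_buffer_size, size_t{64} << 10, size_t{1} << 30);
+//   // write_buffer_size is now 64 KiB.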
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..c7b3510c3
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+ // Check if we have enough room to do the compaction
+ bool enough_room = true;
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Pass the current bg_error_ to SFM so it can decide what checks to
+ // perform. If this DB instance hasn't seen any error yet, the SFM can be
+ // optimistic and not do disk space checks
+ enough_room =
+ sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError());
+ if (enough_room) {
+ *sfm_reserved_compact_space = true;
+ }
+ }
+#else
+ (void)cfd;
+ (void)inputs;
+ (void)sfm_reserved_compact_space;
+#endif // ROCKSDB_LITE
+ if (!enough_room) {
+ // Just in case tests want to change the value of enough_room
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Cancelled compaction because not enough room");
+ RecordTick(stats_, COMPACTION_CANCELLED, 1);
+ }
+ return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer) {
+ assert(*token == nullptr);
+ auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+ cfd->ioptions()->compaction_thread_limiter.get());
+ if (limiter == nullptr) {
+ return true;
+ }
+ *token = limiter->GetToken(force);
+ if (*token != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Thread limiter [%s] increase [%s] compaction task, "
+ "force: %s, tasks after: %d",
+ limiter->GetName().c_str(), cfd->GetName().c_str(),
+ force ? "true" : "false", limiter->GetOutstandingTask());
+ return true;
+ }
+ return false;
+}
+
+Status DBImpl::SyncClosedLogs(JobContext* job_context) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+ mutex_.AssertHeld();
+ autovector<log::Writer*, 1> logs_to_sync;
+ uint64_t current_log_number = logfile_number_;
+ while (logs_.front().number < current_log_number &&
+ logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number < current_log_number; ++it) {
+ auto& log = *it;
+ assert(!log.getting_synced);
+ log.getting_synced = true;
+ logs_to_sync.push_back(log.writer);
+ }
+
+ Status s;
+ if (!logs_to_sync.empty()) {
+ mutex_.Unlock();
+
+ for (log::Writer* log : logs_to_sync) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+ log->get_log_number());
+ s = log->file()->Sync(immutable_db_options_.use_fsync);
+ if (!s.ok()) {
+ break;
+ }
+
+ if (immutable_db_options_.recycle_log_file_num > 0) {
+ s = log->Close();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (s.ok()) {
+ s = directories_.GetWalDir()->Fsync();
+ }
+
+ mutex_.Lock();
+
+ // "number <= current_log_number - 1" is equivalent to
+ // "number < current_log_number".
+ MarkLogsSynced(current_log_number - 1, true, s);
+ if (!s.ok()) {
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlush);
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+ return s;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* made_progress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+
+ FlushJob flush_job(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(),
+ &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ GetDataDir(cfd, 0U),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+ &event_logger_, mutable_cf_options.report_bg_io_stats,
+ true /* sync_output_directory */, true /* write_manifest */, thread_pri);
+
+ FileMetaData file_meta;
+
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+ flush_job.PickMemTable();
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables");
+
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif // ROCKSDB_LITE
+
+ Status s;
+ if (logfile_number_ > 0 &&
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) {
+    // If there is more than one column family, we need to make sure that
+    // all the log files except the most recent one are synced. Otherwise, if
+    // the host crashes after flushing and before the WAL is persistent, the
+    // flushed SST may contain data from write batches whose updates to
+    // other column families are missing.
+ // SyncClosedLogs() may unlock and re-lock the db_mutex.
+ s = SyncClosedLogs(job_context);
+ } else {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+ }
+
+  // Within flush_job.Run, RocksDB may call the event listener to notify of
+  // file creation and deletion.
+ //
+ // Note that flush_job.Run will unlock and lock the db_mutex,
+ // and EventListener callback will be called when the db_mutex
+ // is unlocked by the current thread.
+ if (s.ok()) {
+ s = flush_job.Run(&logs_with_prep_tracker_, &file_meta);
+ } else {
+ flush_job.Cancel();
+ }
+
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+ mutable_cf_options);
+ if (made_progress) {
+ *made_progress = true;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ if (s.ok()) {
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushCompleted(cfd, mutable_cf_options,
+ flush_job.GetCommittedFlushJobsInfo());
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Notify sst_file_manager that a new file was added
+ std::string file_path = MakeTableFileName(
+ cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ &new_bg_error);
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+ return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ if (immutable_db_options_.atomic_flush) {
+ return AtomicFlushMemTablesToOutputFiles(
+ bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+ }
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ Status status;
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ SuperVersionContext* superversion_context = arg.superversion_context_;
+ Status s = FlushMemTableToOutputFile(
+ cfd, mutable_cf_options, made_progress, job_context,
+ superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, log_buffer, thread_pri);
+ if (!s.ok()) {
+ status = s;
+ if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ // At this point, DB is not shutting down, nor is cfd dropped.
+ // Something is wrong, thus we break out of the loop.
+ break;
+ }
+ }
+ }
+ return status;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ autovector<ColumnFamilyData*> cfds;
+ for (const auto& arg : bg_flush_args) {
+ cfds.emplace_back(arg.cfd_);
+ }
+
+#ifndef NDEBUG
+ for (const auto cfd : cfds) {
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ }
+#endif /* !NDEBUG */
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ autovector<Directory*> distinct_output_dirs;
+ autovector<std::string> distinct_output_dir_paths;
+ std::vector<std::unique_ptr<FlushJob>> jobs;
+ std::vector<MutableCFOptions> all_mutable_cf_options;
+ int num_cfs = static_cast<int>(cfds.size());
+ all_mutable_cf_options.reserve(num_cfs);
+ for (int i = 0; i < num_cfs; ++i) {
+ auto cfd = cfds[i];
+ Directory* data_dir = GetDataDir(cfd, 0U);
+ const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+ // Add to distinct output directories if eligible. Use linear search. Since
+ // the number of elements in the vector is not large, performance should be
+ // tolerable.
+ bool found = false;
+ for (const auto& path : distinct_output_dir_paths) {
+ if (path == curr_path) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ distinct_output_dir_paths.emplace_back(curr_path);
+ distinct_output_dirs.emplace_back(data_dir);
+ }
+
+ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+ const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
+ jobs.emplace_back(new FlushJob(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+ false /* sync_output_directory */, false /* write_manifest */,
+ thread_pri));
+ jobs.back()->PickMemTable();
+ }
+
+ std::vector<FileMetaData> file_meta(num_cfs);
+ Status s;
+ assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != num_cfs; ++i) {
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+ job_context->job_id);
+ }
+#endif /* !ROCKSDB_LITE */
+
+ if (logfile_number_ > 0) {
+ // TODO (yanqin) investigate whether we should sync the closed logs for
+    // the single column family case.
+ s = SyncClosedLogs(job_context);
+ }
+
+ // exec_status stores the execution status of flush_jobs as
+ // <bool /* executed */, Status /* status code */>
+ autovector<std::pair<bool, Status>> exec_status;
+ for (int i = 0; i != num_cfs; ++i) {
+ // Initially all jobs are not executed, with status OK.
+ exec_status.emplace_back(false, Status::OK());
+ }
+
+ if (s.ok()) {
+ // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) {
+ exec_status[i].second =
+ jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]);
+ exec_status[i].first = true;
+ }
+ if (num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+ }
+ assert(exec_status.size() > 0);
+ assert(!file_meta.empty());
+ exec_status[0].second =
+ jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]);
+ exec_status[0].first = true;
+
+ Status error_status;
+ for (const auto& e : exec_status) {
+ if (!e.second.ok()) {
+ s = e.second;
+ if (!e.second.IsShutdownInProgress() &&
+ !e.second.IsColumnFamilyDropped()) {
+ // If a flush job did not return OK, and the CF is not dropped, and
+ // the DB is not shutting down, then we have to return this result to
+          // the caller later.
+ error_status = e.second;
+ }
+ }
+ }
+
+ s = error_status.ok() ? s : error_status;
+ }
+
+ if (s.IsColumnFamilyDropped()) {
+ s = Status::OK();
+ }
+
+ if (s.ok() || s.IsShutdownInProgress()) {
+ // Sync on all distinct output directories.
+ for (auto dir : distinct_output_dirs) {
+ if (dir != nullptr) {
+ Status error_status = dir->Fsync();
+ if (!error_status.ok()) {
+ s = error_status;
+ break;
+ }
+ }
+ }
+ } else {
+ // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+ // it is not because of CF drop.
+ // Have to cancel the flush jobs that have NOT executed because we need to
+ // unref the versions.
+ for (int i = 0; i != num_cfs; ++i) {
+ if (!exec_status[i].first) {
+ jobs[i]->Cancel();
+ }
+ }
+ for (int i = 0; i != num_cfs; ++i) {
+ if (exec_status[i].first && exec_status[i].second.ok()) {
+ auto& mems = jobs[i]->GetMemTables();
+ cfds[i]->imm()->RollbackMemtableFlush(mems,
+ file_meta[i].fd.GetNumber());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ auto wait_to_install_func = [&]() {
+ bool ready = true;
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (cfds[i]->IsDropped()) {
+ // If the column family is dropped, then do not wait.
+ continue;
+ } else if (!mems.empty() &&
+ cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+ // If a flush job needs to install the flush result for mems and
+ // mems[0] is not the earliest memtable, it means another thread must
+ // be installing flush results for the same column family, then the
+ // current thread needs to wait.
+ ready = false;
+ break;
+ } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+ bg_flush_args[i].max_memtable_id_) {
+ // If a flush job does not need to install flush results, then it has
+ // to wait until all memtables up to max_memtable_id_ (inclusive) are
+ // installed.
+ ready = false;
+ break;
+ }
+ }
+ return ready;
+ };
+
+ bool resuming_from_bg_err = error_handler_.IsDBStopped();
+ while ((!error_handler_.IsDBStopped() ||
+ error_handler_.GetRecoveryError().ok()) &&
+ !wait_to_install_func()) {
+ atomic_flush_install_cv_.Wait();
+ }
+
+ s = resuming_from_bg_err ? error_handler_.GetRecoveryError()
+ : error_handler_.GetBGError();
+ }
+
+ if (s.ok()) {
+ autovector<ColumnFamilyData*> tmp_cfds;
+ autovector<const autovector<MemTable*>*> mems_list;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<FileMetaData*> tmp_file_meta;
+ for (int i = 0; i != num_cfs; ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (!cfds[i]->IsDropped() && !mems.empty()) {
+ tmp_cfds.emplace_back(cfds[i]);
+ mems_list.emplace_back(&mems);
+ mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+ tmp_file_meta.emplace_back(&file_meta[i]);
+ }
+ }
+
+ s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &mutex_, tmp_file_meta,
+ &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
+ }
+
+ if (s.ok()) {
+ assert(num_cfs ==
+ static_cast<int>(job_context->superversion_contexts.size()));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ InstallSuperVersionAndScheduleWork(cfds[i],
+ &job_context->superversion_contexts[i],
+ all_mutable_cf_options[i]);
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ cfds[i]->GetName().c_str(),
+ cfds[i]->current()->storage_info()->LevelSummary(&tmp));
+ }
+ if (made_progress) {
+ *made_progress = true;
+ }
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+ for (int i = 0; i != num_cfs; ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+ jobs[i]->GetCommittedFlushJobsInfo());
+ if (sfm) {
+ std::string file_path = MakeTableFileName(
+ cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+ sfm->OnAddFile(file_path);
+ if (sfm->IsMaxAllowedSpaceReached() &&
+ error_handler_.GetBGError().ok()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ error_handler_.SetBGError(new_bg_error,
+ BackgroundErrorReason::kFlush);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress()) {
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+
+ return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ FlushJobInfo info{};
+ info.cf_id = cfd->GetID();
+ info.cf_name = cfd->GetName();
+ // TODO(yhchiang): make db_paths dynamic in case flush does not
+ // go to L0 in the future.
+ const uint64_t file_number = file_meta->fd.GetNumber();
+ info.file_path =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+ info.file_number = file_number;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.triggered_writes_slowdown = triggered_writes_slowdown;
+ info.triggered_writes_stop = triggered_writes_stop;
+ info.smallest_seqno = file_meta->fd.smallest_seqno;
+ info.largest_seqno = file_meta->fd.largest_seqno;
+ info.flush_reason = cfd->GetFlushReason();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+// no need to signal bg_cv_ as it will be signaled at the end of the
+// flush process.
+#else
+ (void)cfd;
+ (void)file_meta;
+ (void)mutable_cf_options;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+ assert(flush_jobs_info != nullptr);
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ for (auto& info : *flush_jobs_info) {
+ info->triggered_writes_slowdown = triggered_writes_slowdown;
+ info->triggered_writes_stop = triggered_writes_stop;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushCompleted(this, *info);
+ }
+ }
+ flush_jobs_info->clear();
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)mutable_cf_options;
+ (void)flush_jobs_info;
+#endif // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+
+ if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+ return Status::InvalidArgument("Invalid target path ID");
+ }
+
+ bool exclusive = options.exclusive_manual_compaction;
+
+ bool flush_needed = true;
+ if (begin != nullptr && end != nullptr) {
+ // TODO(ajkr): We could also optimize away the flush in certain cases where
+ // one/both sides of the interval are unbounded. But it requires more
+ // changes to RangesOverlapWithMemtables.
+ Range range(*begin, *end);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed);
+ CleanupSuperVersion(super_version);
+ }
+
+ Status s;
+ if (flush_needed) {
+ FlushOptions fo;
+ fo.allow_write_stall = options.allow_write_stall;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ mutex_.Lock();
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped */);
+ } else {
+ s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+ false /* writes_stopped*/);
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ int max_level_with_files = 0;
+ // max_file_num_to_ignore can be used to filter out newly created SST files,
+ // useful for bottom level compaction in a manual compaction
+ uint64_t max_file_num_to_ignore = port::kMaxUint64;
+ uint64_t next_file_number = port::kMaxUint64;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* base = cfd->current();
+ for (int level = 1; level < base->storage_info()->num_non_empty_levels();
+ level++) {
+ if (base->storage_info()->OverlapInLevel(level, begin, end)) {
+ max_level_with_files = level;
+ }
+ }
+ next_file_number = versions_->current_next_file_number();
+ }
+
+ int final_output_level = 0;
+
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+ cfd->NumberLevels() > 1) {
+ // Always compact all files together.
+ final_output_level = cfd->NumberLevels() - 1;
+    // if the bottommost level is reserved
+ if (immutable_db_options_.allow_ingest_behind) {
+ final_output_level--;
+ }
+ s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+ final_output_level, options, begin, end, exclusive,
+ false, max_file_num_to_ignore);
+ } else {
+ for (int level = 0; level <= max_level_with_files; level++) {
+ int output_level;
+      // In case the compaction is universal or if we're compacting the
+      // bottommost level, the output level will be the same as the input one.
+      // Level 0 can never be the bottommost level (i.e. if all files are in
+      // level 0, we will compact to level 1).
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ output_level = level;
+ } else if (level == max_level_with_files && level > 0) {
+ if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kSkip) {
+ // Skip bottommost level compaction
+ continue;
+ } else if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kIfHaveCompactionFilter &&
+ cfd->ioptions()->compaction_filter == nullptr &&
+ cfd->ioptions()->compaction_filter_factory == nullptr) {
+ // Skip bottommost level compaction since we don't have a compaction
+ // filter
+ continue;
+ }
+ output_level = level;
+ // update max_file_num_to_ignore only for bottom level compaction
+ // because data in newly compacted files in middle levels may still need
+ // to be pushed down
+ max_file_num_to_ignore = next_file_number;
+ } else {
+ output_level = level + 1;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+ cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+ level == 0) {
+ output_level = ColumnFamilyData::kCompactToBaseLevel;
+ }
+ }
+ s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+ exclusive, false, max_file_num_to_ignore);
+ if (!s.ok()) {
+ break;
+ }
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ final_output_level = cfd->NumberLevels() - 1;
+ } else if (output_level > final_output_level) {
+ final_output_level = output_level;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+ }
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+
+ if (options.change_level) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[RefitLevel] waiting for background threads to stop");
+ s = PauseBackgroundWork();
+ if (s.ok()) {
+ s = ReFitLevel(cfd, final_output_level, options.target_level);
+ }
+ ContinueBackgroundWork();
+ }
+ LogFlush(immutable_db_options_.info_log);
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // an automatic compaction that has been scheduled might have been
+ // preempted by the manual compactions. Need to schedule it back.
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names,
+ const int output_level, const int output_path_id,
+ std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+ (void)compact_options;
+ (void)column_family;
+ (void)input_file_names;
+ (void)output_level;
+ (void)output_path_id;
+ (void)output_file_names;
+ (void)compaction_job_info;
+ // not supported in lite version
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (column_family == nullptr) {
+ return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+ }
+
+ auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ assert(cfd);
+
+ Status s;
+ JobContext job_context(0, true);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ // Perform CompactFiles
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ // We need to get current after `WaitForIngestFile`, because
+ // `IngestExternalFile` may add files that overlap with `input_file_names`
+ auto* current = cfd->current();
+ current->Ref();
+
+ s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+ output_file_names, output_level, output_path_id,
+ &job_context, &log_buffer, compaction_job_info);
+
+ current->Unref();
+ }
+
+ // Find and delete obsolete files
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because job_context does not
+ // catch all created files if compaction failed.
+ FindObsoleteFiles(&job_context, !s.ok());
+ } // release the mutex
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // state of DB, so info_log might not be available after that point.
+    // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ // no mutex is locked here. No need to Unlock() and Lock() here.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ }
+
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+ const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+ Version* version, const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names, const int output_level,
+ int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info) {
+ mutex_.AssertHeld();
+
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ if (manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input_file_names) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): we can use version directly here if none of the
+  // following function calls is pluggable by external developers.
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ if (output_path_id < 0) {
+ if (cfd->ioptions()->cf_paths.size() == 1U) {
+ output_path_id = 0;
+ } else {
+ return Status::NotSupported(
+ "Automatic output path selection is not "
+ "yet supported in CompactFiles()");
+ }
+ }
+
+ Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+ &input_set, cf_meta, output_level);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<CompactionInputFiles> input_files;
+ s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, version->storage_info(), compact_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (const auto& inputs : input_files) {
+ if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+ return Status::Aborted(
+ "Some of the necessary compaction input "
+ "files are already being compacted");
+ }
+ }
+ bool sfm_reserved_compact_space = false;
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ return Status::CompactionTooLarge();
+ }
+
+ // At this point, CompactFiles will be run.
+ bg_compaction_scheduled_++;
+
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ compact_options, input_files, output_level, version->storage_info(),
+ *cfd->GetLatestMutableCFOptions(), output_path_id));
+ // we already sanitized the set of input files and checked for conflicts
+ // without releasing the lock, so we're guaranteed a compaction can be formed.
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+ // deletion compaction currently not allowed in CompactFiles.
+ assert(!c->deletion_compaction());
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJobStats compaction_job_stats;
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_,
+ &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+ *c->mutable_cf_options());
+
+ compaction_job.Prepare();
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT("CompactFilesImpl:0");
+ TEST_SYNC_POINT("CompactFilesImpl:1");
+ compaction_job.Run();
+ TEST_SYNC_POINT("CompactFilesImpl:2");
+ TEST_SYNC_POINT("CompactFilesImpl:3");
+ mutex_.Lock();
+
+ Status status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ if (compaction_job_info != nullptr) {
+ BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+ job_context->job_id, version, compaction_job_info);
+ }
+
+ if (status.ok()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else if (status.IsManualCompactionPaused()) {
+ // Don't report stopping manual compaction as error
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Stopping manual compaction",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id);
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Compaction error: %s",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id, status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+
+ if (output_file_names != nullptr) {
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ (*output_file_names)
+ .push_back(TableFileName(c->immutable_cf_options()->cf_paths,
+ newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+ }
+
+ c.reset();
+
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+ TEST_SYNC_POINT("CompactFilesImpl:End");
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ bg_compaction_paused_++;
+ while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+ bg_flush_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+ bg_work_paused_++;
+ return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ if (bg_work_paused_ == 0) {
+ return Status::InvalidArgument();
+ }
+ assert(bg_work_paused_ > 0);
+ assert(bg_compaction_paused_ > 0);
+ bg_compaction_paused_--;
+ bg_work_paused_--;
+ // It's sufficient to check just bg_work_paused_ here since
+ // bg_work_paused_ is always no greater than bg_compaction_paused_
+ if (bg_work_paused_ == 0) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ info.cf_name = cfd->GetName();
+ info.status = st;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.base_input_level = c->start_level();
+ info.output_level = c->output_level();
+ info.stats = job_stats;
+ info.table_properties = c->GetOutputTableProperties();
+ info.compaction_reason = c->compaction_reason();
+ info.compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+ file_number, desc.GetPathId());
+ info.input_files.push_back(fn);
+ info.input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (info.table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ info.table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ info.output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ info.output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+ ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ return;
+ }
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+ &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionCompleted(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)compaction_job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+ if (target_level >= cfd->NumberLevels()) {
+ return Status::InvalidArgument("Target level exceeds number of levels");
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+
+ Status status;
+
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[ReFitLevel] another thread is refitting");
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+ }
+
+ auto* vstorage = cfd->current()->storage_info();
+ if (to_level > level) {
+ if (level == 0) {
+ return Status::NotSupported(
+ "Cannot change from level 0 to other levels.");
+ }
+ // Check levels are empty for a trivial move
+ for (int l = level + 1; l <= to_level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ }
+ if (to_level != level) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : vstorage->LevelFiles(level)) {
+ edit.DeleteFile(level, f->fd.GetNumber());
+ edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+ edit.DebugString().data());
+
+ status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+ directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+ cfd->GetName().c_str(), status.ToString().data());
+
+ if (status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+ }
+ }
+
+ sv_context.Clean();
+ refitting_level_ = false;
+
+ return status;
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+ return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ InstrumentedMutexLock l(&mutex_);
+ return cfh->cfd()
+ ->GetSuperVersion()
+ ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+ cfh->GetName().c_str());
+ Status s;
+ if (immutable_db_options_.atomic_flush) {
+ s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+ FlushReason::kManualFlush);
+ } else {
+ s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual flush finished, status: %s\n",
+ cfh->GetName().c_str(), s.ToString().c_str());
+ return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ if (!immutable_db_options_.atomic_flush) {
+ for (auto cfh : column_families) {
+ s = Flush(flush_options, cfh);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush start.\n"
+ "=====Column families:=====");
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ autovector<ColumnFamilyData*> cfds;
+ std::for_each(column_families.begin(), column_families.end(),
+ [&cfds](ColumnFamilyHandle* elem) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+ cfds.emplace_back(cfh->cfd());
+ });
+ s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush finished, status: %s\n"
+ "=====Column families:=====",
+ s.ToString().c_str());
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ }
+ return s;
+}
+
+Status DBImpl::RunManualCompaction(
+ ColumnFamilyData* cfd, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const Slice* begin,
+ const Slice* end, bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore) {
+ assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+ input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+ CompactionArg* ca;
+
+ bool scheduled = false;
+ bool manual_conflict = false;
+ ManualCompactionState manual;
+ manual.cfd = cfd;
+ manual.input_level = input_level;
+ manual.output_level = output_level;
+ manual.output_path_id = compact_range_options.target_path_id;
+ manual.done = false;
+ manual.in_progress = false;
+ manual.incomplete = false;
+ manual.exclusive = exclusive;
+ manual.disallow_trivial_move = disallow_trivial_move;
+  // For universal compaction, we require every manual compaction to compact
+  // all files.
+ if (begin == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ manual.end = &end_storage;
+ }
+
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+ InstrumentedMutexLock l(&mutex_);
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This is needed to ensure that this manual compaction
+ // can compact any range of keys/files.
+ //
+ // HasPendingManualCompaction() is true when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+  // Note that the following loop doesn't stop more than one thread calling
+  // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ AddManualCompaction(&manual);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+ if (exclusive) {
+ while (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0) {
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual compaction starting", cfd->GetName().c_str());
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ // We don't check bg_error_ here, because if we get the error in compaction,
+ // the compaction will set manual.status to bg_error_ and set manual.done to
+ // true.
+ while (!manual.done) {
+ assert(HasPendingManualCompaction());
+ manual_conflict = false;
+ Compaction* compaction = nullptr;
+ if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+ scheduled ||
+ (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+ ((compaction = manual.cfd->CompactRange(
+ *manual.cfd->GetLatestMutableCFOptions(), manual.input_level,
+ manual.output_level, compact_range_options, manual.begin,
+ manual.end, &manual.manual_end, &manual_conflict,
+ max_file_num_to_ignore)) == nullptr &&
+ manual_conflict))) {
+ // exclusive manual compactions should not see a conflict during
+ // CompactRange
+ assert(!exclusive || !manual_conflict);
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ if (scheduled && manual.incomplete == true) {
+ assert(!manual.in_progress);
+ scheduled = false;
+ manual.incomplete = false;
+ }
+ } else if (!scheduled) {
+ if (compaction == nullptr) {
+ manual.done = true;
+ bg_cv_.SignalAll();
+ continue;
+ }
+ ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->manual_compaction_state = &manual;
+ ca->prepicked_compaction->compaction = compaction;
+ if (!RequestCompactionToken(
+ cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+ // Don't throttle manual compaction, only count outstanding tasks.
+ assert(false);
+ }
+ manual.incomplete = false;
+ bg_compaction_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ scheduled = true;
+ }
+ }
+
+ log_buffer.FlushBufferToLog();
+ assert(!manual.in_progress);
+ assert(HasPendingManualCompaction());
+ RemoveManualCompaction(&manual);
+ bg_cv_.SignalAll();
+ return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req) {
+ assert(req != nullptr);
+ req->reserve(cfds.size());
+ for (const auto cfd : cfds) {
+ if (nullptr == cfd) {
+ // cfd may be null, see DBImpl::ScheduleFlushes
+ continue;
+ }
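+ // Record the newest immutable memtable ID for this CF; the flush generated
+ // from this request is considered complete once all memtables with IDs up to
+ // and including this one have been flushed (see WaitForFlushMemTables()).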
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+ req->emplace_back(cfd, max_memtable_id);
+ }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_options,
+ FlushReason flush_reason, bool writes_stopped) {
+ Status s;
+ uint64_t flush_memtable_id = 0;
+ if (!flush_options.allow_write_stall) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+ if (!s.ok() || !flush_needed) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
+ s = SwitchMemtable(cfd, &context);
+ }
+ if (s.ok()) {
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ flush_memtable_id = cfd->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd, flush_memtable_id);
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && cfd_stats != cfd &&
+ !cfd_stats->mem()->IsEmpty()) {
+ // only force flush stats CF when it will be the only CF lagging
+ // behind after the current flush
+ bool stats_cf_flush_needed = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ stats_cf_flush_needed = false;
+ }
+ }
+ if (stats_cf_flush_needed) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with manual flush of %s "
+ "to avoid holding old logs",
+ cfd->GetName().c_str());
+ s = SwitchMemtable(cfd_stats, &context);
+ flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID();
+ flush_req.emplace_back(cfd_stats, flush_memtable_id);
+ }
+ }
+ }
+ }
+
+ if (s.ok() && !flush_req.empty()) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto& elem : flush_req) {
+ ColumnFamilyData* loop_cfd = elem.first;
+ loop_cfd->Ref();
+ }
+ }
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ cfds.push_back(iter.first);
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* tmp_cfd : cfds) {
+ tmp_cfd->UnrefAndTryDelete();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+ return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& flush_options, FlushReason flush_reason,
+ bool writes_stopped) {
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ int num_cfs_to_flush = 0;
+ for (auto cfd : column_family_datas) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ if (!s.ok()) {
+ return s;
+ } else if (flush_needed) {
+ ++num_cfs_to_flush;
+ }
+ }
+ if (0 == num_cfs_to_flush) {
+ return s;
+ }
+ }
+ FlushRequest flush_req;
+ autovector<ColumnFamilyData*> cfds;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (!writes_stopped) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ for (auto cfd : column_family_datas) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds.emplace_back(cfd);
+ }
+ }
+ for (auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+ continue;
+ }
+ cfd->Ref();
+ s = SwitchMemtable(cfd, &context);
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ AssignAtomicFlushSeq(cfds);
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+ // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ }
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (!writes_stopped) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* cfd : cfds) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is being flushed already.
+// This method tries to avoid such a stall (similar to CompactRange() behavior):
+// it emulates how the SuperVersion / LSM would change if the flush happened,
+// checks that against various constraints, and delays the flush if it would
+// cause a write stall.
+// The caller should check status and flush_needed to see if the flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed) {
+ {
+ *flush_needed = true;
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+ WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+ do {
+ if (write_stall_condition != WriteStallCondition::kNormal) {
+ // Same error handling as user writes: Don't wait if there's a
+ // background error, even if it's a soft error. We might wait here
+ // indefinitely as the pending flushes/compactions may never finish
+ // successfully, resulting in the stall condition lasting indefinitely
+ if (error_handler_.IsBGWorkStopped()) {
+ return error_handler_.GetBGError();
+ }
+
+ TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] WaitUntilFlushWouldNotStallWrites"
+ " waiting on stall conditions to clear",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ if (cfd->IsDropped()) {
+ return Status::ColumnFamilyDropped();
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+
+ uint64_t earliest_memtable_id =
+ std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+ if (earliest_memtable_id > orig_active_memtable_id) {
+ // We waited so long that the memtable we were originally waiting on was
+ // flushed.
+ *flush_needed = false;
+ return Status::OK();
+ }
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ // Skip the stall check if we're below the auto-flush and auto-compaction
+ // triggers. If the DB stalled under these conditions, it would mean the stall
+ // triggers are set so low that stalling is required before any background
+ // work can run. In that case we shouldn't wait, since no background work will
+ // be scheduled.
+ if (cfd->imm()->NumNotFlushed() <
+ cfd->ioptions()->min_write_buffer_number_to_merge &&
+ vstorage->l0_delay_trigger_count() <
+ mutable_cf_options.level0_file_num_compaction_trigger) {
+ break;
+ }
+
+ // Check whether one extra immutable memtable or one extra L0 file would
+ // cause the write-stall condition to be entered. The DB could still enter
+ // stall mode due to pending compaction bytes, but that's less common.
+ write_stall_condition =
+ ColumnFamilyData::GetWriteStallConditionAndCause(
+ cfd->imm()->NumNotFlushed() + 1,
+ vstorage->l0_delay_trigger_count() + 1,
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options)
+ .first;
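+ // Only the stall condition half of the returned (condition, cause) pair is
+ // needed here; the cause is ignored.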
+ } while (write_stall_condition != WriteStallCondition::kNormal);
+ }
+ return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// Let N = cfds.size().
+// For i in [0, N):
+//  1) if flush_memtable_ids[i] is not null, then all memtables of THIS column
+//     family with IDs no greater than *flush_memtable_ids[i] have to be
+//     flushed;
+//  2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+//     family have to be flushed.
+// Finish waiting when ALL column families finish flushing their memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from a
+// background error or is in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err) {
+ int num = static_cast<int>(cfds.size());
+ // Wait until the flushes complete
+ InstrumentedMutexLock l(&mutex_);
+ // If the caller is trying to resume from bg error, then
+ // error_handler_.IsDBStopped() is true.
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ // If an error has occurred during resumption, then no need to wait.
+ if (!error_handler_.GetRecoveryError().ok()) {
+ break;
+ }
+ // Number of column families that have been dropped.
+ int num_dropped = 0;
+ // Number of column families that have finished flush.
+ int num_finished = 0;
+ for (int i = 0; i < num; ++i) {
+ if (cfds[i]->IsDropped()) {
+ ++num_dropped;
+ } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+ (flush_memtable_ids[i] != nullptr &&
+ cfds[i]->imm()->GetEarliestMemTableID() >
+ *flush_memtable_ids[i])) {
+ ++num_finished;
+ }
+ }
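+ // Special case: the request covered a single column family and it has been
+ // dropped; report an error rather than treating the flush as successful.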
+ if (1 == num_dropped && 1 == num) {
+ return Status::InvalidArgument("Cannot flush a dropped CF");
+ }
+ // Every column family involved in this flush request has either been
+ // dropped or finished its flush, so it's time to stop waiting.
+ if (num_dropped + num_finished == num) {
+ break;
+ }
+ bg_cv_.Wait();
+ }
+ Status s;
+ // If not resuming from bg error, and an error has caused the DB to stop,
+ // then report the bg error to caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+ Status s;
+ for (auto cf_ptr : column_family_handles) {
+ Status status =
+ this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+ if (!status.ok()) {
+ s = status;
+ }
+ }
+
+ return s;
+}
+
+void DBImpl::DisableManualCompaction() {
+ manual_compaction_paused_.store(true, std::memory_order_release);
+}
+
+void DBImpl::EnableManualCompaction() {
+ manual_compaction_paused_.store(false, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ if (!opened_successfully_) {
+ // Compaction may introduce a data race with DB open
+ return;
+ }
+ if (bg_work_paused_ > 0) {
+ // we paused the background work
+ return;
+ } else if (error_handler_.IsBGWorkStopped() &&
+ !error_handler_.IsRecoveryInProgress()) {
+ // There has been a hard error and this call is not part of the recovery
+ // sequence. Bail out here so we don't get into an endless loop of
+ // scheduling BG work which will again call this function
+ return;
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ return;
+ }
+ auto bg_job_limits = GetBGJobLimits();
+ bool is_flush_pool_empty =
+ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
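+ // With no dedicated HIGH-priority (flush) threads, flushes have to be
+ // scheduled in the LOW-priority (compaction) pool; see the special case below.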
+ while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::HIGH;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+ &unscheduled_flushes_);
+ }
+
+ // Special case -- if the high-pri (flush) thread pool is empty, schedule
+ // flushes in the low-pri (compaction) thread pool instead.
+ if (is_flush_pool_empty) {
+ while (unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ + bg_compaction_scheduled_ <
+ bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ }
+ }
+
+ if (bg_compaction_paused_ > 0) {
+ // we paused the background compaction
+ return;
+ } else if (error_handler_.IsBGWorkStopped()) {
+ // Compaction is not part of the recovery sequence from a hard error. We
+ // might get here because recovery might do a flush and install a new
+ // super version, which will try to schedule pending compactions. Bail
+ // out here and let the higher level recovery handle compactions
+ return;
+ }
+
+ if (HasExclusiveManualCompaction()) {
+ // only manual compactions are allowed to run. don't schedule automatic
+ // compactions
+ TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+ return;
+ }
+
+ while (bg_compaction_scheduled_ < bg_job_limits.max_compactions &&
+ unscheduled_compactions_ > 0) {
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = nullptr;
+ bg_compaction_scheduled_++;
+ unscheduled_compactions_--;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+ mutex_.AssertHeld();
+ return GetBGJobLimits(immutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions) {
+ BGJobLimits res;
+ if (max_background_flushes == -1 && max_background_compactions == -1) {
+ // for our first stab at implementing max_background_jobs, simply allocate a
+ // quarter of the threads to flushes.
+ res.max_flushes = std::max(1, max_background_jobs / 4);
+ res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
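+ // For example, max_background_jobs == 8 yields max_flushes == 2 and
+ // max_compactions == 6; max_background_jobs == 1 yields 1 and 1, since each
+ // limit is clamped to at least one.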
+ } else {
+ // compatibility code in case users haven't migrated to max_background_jobs,
+ // which automatically computes flush/compaction limits
+ res.max_flushes = std::max(1, max_background_flushes);
+ res.max_compactions = std::max(1, max_background_compactions);
+ }
+ if (!parallelize_compactions) {
+ // throttle background compactions until we deem it necessary
+ res.max_compactions = 1;
+ }
+ return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+ assert(!cfd->queued_for_compaction());
+ cfd->Ref();
+ compaction_queue_.push_back(cfd);
+ cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+ assert(!compaction_queue_.empty());
+ auto cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(cfd->queued_for_compaction());
+ cfd->set_queued_for_compaction(false);
+ return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+ assert(!flush_queue_.empty());
+ FlushRequest flush_req = flush_queue_.front();
+ flush_queue_.pop_front();
+ // TODO: need to unset flush reason?
+ return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+ assert(!compaction_queue_.empty());
+ assert(*token == nullptr);
+ autovector<ColumnFamilyData*> throttled_candidates;
+ ColumnFamilyData* cfd = nullptr;
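+ // Walk the queue front to back; column families throttled by the compaction
+ // thread limiter are set aside and pushed back to the front of the queue
+ // below, preserving their original order.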
+ while (!compaction_queue_.empty()) {
+ auto first_cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(first_cfd->queued_for_compaction());
+ if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+ throttled_candidates.push_back(first_cfd);
+ continue;
+ }
+ cfd = first_cfd;
+ cfd->set_queued_for_compaction(false);
+ break;
+ }
+ // Add throttled compaction candidates back to the queue in their original order.
+ for (auto iter = throttled_candidates.rbegin();
+ iter != throttled_candidates.rend(); ++iter) {
+ compaction_queue_.push_front(*iter);
+ }
+ return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+ FlushReason flush_reason) {
+ if (flush_req.empty()) {
+ return;
+ }
+ for (auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ cfd->Ref();
+ cfd->SetFlushReason(flush_reason);
+ }
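+ // A single FlushRequest may cover multiple column families but counts as one
+ // unscheduled flush.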
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+ if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id) {
+ mutex_.AssertHeld();
+ PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+ purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
+ FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+
+ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+ static_cast_with_check<DBImpl, DB>(fta.db_)->BackgroundCallFlush(
+ fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+ TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+ auto prepicked_compaction =
+ static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+ static_cast_with_check<DBImpl, DB>(ca.db)->BackgroundCallCompaction(
+ prepicked_compaction, Env::Priority::LOW);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+ CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+ delete static_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+ TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+ auto* prepicked_compaction = ca.prepicked_compaction;
+ assert(prepicked_compaction && prepicked_compaction->compaction &&
+ !prepicked_compaction->manual_compaction_state);
+ ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ if (ca.prepicked_compaction != nullptr) {
+ if (ca.prepicked_compaction->compaction != nullptr) {
+ delete ca.prepicked_compaction->compaction;
+ }
+ delete ca.prepicked_compaction;
+ }
+ TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+ TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ Status status;
+ *reason = FlushReason::kOthers;
+ // If BG work is stopped due to an error, but a recovery is in progress,
+ // that means this flush is part of the recovery. So allow it to go through
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ }
+ } else if (!error_handler_.IsRecoveryInProgress()) {
+ status = error_handler_.GetBGError();
+ }
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ autovector<BGFlushArg> bg_flush_args;
+ std::vector<SuperVersionContext>& superversion_contexts =
+ job_context->superversion_contexts;
+ autovector<ColumnFamilyData*> column_families_not_to_flush;
+ while (!flush_queue_.empty()) {
+ // This cfd is already referenced
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ superversion_contexts.clear();
+ superversion_contexts.reserve(flush_req.size());
+
+ for (const auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+ // can't flush this CF, try next one
+ column_families_not_to_flush.push_back(cfd);
+ continue;
+ }
+ superversion_contexts.emplace_back(SuperVersionContext(true));
+ bg_flush_args.emplace_back(cfd, iter.second,
+ &(superversion_contexts.back()));
+ }
+ if (!bg_flush_args.empty()) {
+ break;
+ }
+ }
+
+ if (!bg_flush_args.empty()) {
+ auto bg_job_limits = GetBGJobLimits();
+ for (const auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "Calling FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d, compaction slots available "
+ "%d, "
+ "flush slots scheduled %d, compaction slots scheduled %d",
+ cfd->GetName().c_str(), bg_job_limits.max_flushes,
+ bg_job_limits.max_compactions, bg_flush_scheduled_,
+ bg_compaction_scheduled_);
+ }
+ status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+ job_context, log_buffer, thread_pri);
+ TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+ // All the CFDs in the FlushReq must have the same flush reason, so just
+ // grab the first one
+ *reason = bg_flush_args[0].cfd_->GetFlushReason();
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ if (cfd->UnrefAndTryDelete()) {
+ arg.cfd_ = nullptr;
+ }
+ }
+ }
+ for (auto cfd : column_families_not_to_flush) {
+ cfd->UnrefAndTryDelete();
+ }
+ return status;
+}
+
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start");
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(bg_flush_scheduled_);
+ num_running_flushes_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ FlushReason reason;
+
+ Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+ &reason, thread_pri);
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+ reason != FlushReason::kErrorRecovery) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background flush error: %s"
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If flush failed, we want to delete all temporary files that we might have
+ // created. Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsColumnFamilyDropped());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+ assert(num_running_flushes_ > 0);
+ num_running_flushes_--;
+ bg_flush_scheduled_--;
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ atomic_flush_install_cv_.SignalAll();
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority bg_thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ TEST_SYNC_POINT("BackgroundCallCompaction:0");
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for currently running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ num_running_compactions_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert((bg_thread_pri == Env::Priority::BOTTOM &&
+ bg_bottom_compaction_scheduled_) ||
+ (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+ Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+ prepicked_compaction, bg_thread_pri);
+ TEST_SYNC_POINT("BackgroundCallCompaction:1");
+ if (s.IsBusy()) {
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ env_->SleepForMicroseconds(10000); // prevent hot loop
+ mutex_.Lock();
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(immutable_db_options_.info_log);
+ env_->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ } else if (s.IsManualCompactionPaused()) {
+ ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+ assert(m);
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+ m->cfd->GetName().c_str(), job_context.job_id);
+ }
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If compaction failed, we want to delete all temporary files that we might
+ // have created (they might not be all recorded in job_context in case of a
+ // failure). Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() &&
+ !s.IsColumnFamilyDropped());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of DB, so info_log might not be available after that point.
+ // The same applies to accessing any other state that DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ assert(num_running_compactions_ > 0);
+ num_running_compactions_--;
+ if (bg_thread_pri == Env::Priority::LOW) {
+ bg_compaction_scheduled_--;
+ } else {
+ assert(bg_thread_pri == Env::Priority::BOTTOM);
+ bg_bottom_compaction_scheduled_--;
+ }
+
+ versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
+
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ if (made_progress ||
+ (bg_compaction_scheduled_ == 0 &&
+ bg_bottom_compaction_scheduled_ == 0) ||
+ HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+ // signal if
+ // * made_progress -- need to wakeup DelayWrite
+ // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+ // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+ JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri) {
+ ManualCompactionState* manual_compaction =
+ prepicked_compaction == nullptr
+ ? nullptr
+ : prepicked_compaction->manual_compaction_state;
+ *made_progress = false;
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+ bool is_manual = (manual_compaction != nullptr);
+ std::unique_ptr<Compaction> c;
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->compaction != nullptr) {
+ c.reset(prepicked_compaction->compaction);
+ }
+ bool is_prepicked = is_manual || c;
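+ // A compaction is considered prepicked if it is manual or was already picked
+ // and handed to this thread, e.g. one forwarded to the bottom-priority pool.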
+
+ // (manual_compaction->in_progress == false);
+ bool trivial_move_disallowed =
+ is_manual && manual_compaction->disallow_trivial_move;
+
+ CompactionJobStats compaction_job_stats;
+ Status status;
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ } else if (is_manual &&
+ manual_compaction_paused_.load(std::memory_order_acquire)) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ } else {
+ status = error_handler_.GetBGError();
+ // If we get here, it means a hard error happened after this compaction
+ // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+ // a chance to execute. Since we didn't pop a cfd from the compaction
+ // queue, increment unscheduled_compactions_
+ unscheduled_compactions_++;
+ }
+
+ if (!status.ok()) {
+ if (is_manual) {
+ manual_compaction->status = status;
+ manual_compaction->done = true;
+ manual_compaction->in_progress = false;
+ manual_compaction = nullptr;
+ }
+ if (c) {
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ }
+ return status;
+ }
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction->in_progress = true;
+ }
+
+ std::unique_ptr<TaskLimiterToken> task_token;
+
+ // InternalKey manual_end_storage;
+ // InternalKey* manual_end = &manual_end_storage;
+ bool sfm_reserved_compact_space = false;
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ assert(m->in_progress);
+ if (!c) {
+ m->done = true;
+ m->manual_end = nullptr;
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Manual compaction from level-%d from %s .. "
+ "%s; nothing to do\n",
+ m->cfd->GetName().c_str(), m->input_level,
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"));
+ } else {
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ status = Status::CompactionTooLarge();
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ ((m->done || m->manual_end == nullptr)
+ ? "(end)"
+ : m->manual_end->DebugString().c_str()));
+ }
+ }
+ } else if (!is_prepicked && !compaction_queue_.empty()) {
+ if (HasExclusiveManualCompaction()) {
+ // Can't compact right now, but try again later
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+ // Stay in the compaction queue.
+ unscheduled_compactions_++;
+
+ return Status::OK();
+ }
+
+ auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+ if (cfd == nullptr) {
+ // Can't find any executable task from the compaction queue.
+ // All tasks have been throttled by compaction thread limiter.
+ ++unscheduled_compactions_;
+ return Status::Busy();
+ }
+
+ // We unreference here because the following code will take a Ref() on
+ // this cfd if it is going to use it (Compaction class holds a
+ // reference).
+ // This will all happen under a mutex so we don't have to be afraid of
+ // somebody else deleting it.
+ if (cfd->UnrefAndTryDelete()) {
+ // This was the last reference of the column family, so no need to
+ // compact.
+ return Status::OK();
+ }
+
+ // Pick up latest mutable CF Options and use it throughout the
+ // compaction job
+ // Compaction makes a copy of the latest MutableCFOptions. It should be used
+ // throughout the compaction procedure to ensure consistency. It will
+ // eventually be installed into the SuperVersion.
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+ // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+ // compaction is not necessary. Need to make sure mutex is held
+ // until we make a copy in the following code
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+ c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+ if (c != nullptr) {
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+
+ c.reset();
+ // Don't need to sleep here, because BackgroundCallCompaction
+ // will sleep if !s.ok()
+ status = Status::CompactionTooLarge();
+ } else {
+ // update statistics
+ RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
+ c->inputs(0)->size());
+ // There are three things that can change compaction score:
+ // 1) When flush or compaction finish. This case is covered by
+ // InstallSuperVersionAndScheduleWork
+ // 2) When MutableCFOptions changes. This case is also covered by
+ // InstallSuperVersionAndScheduleWork, because this is when the new
+ // options take effect.
+ // 3) When we Pick a new compaction, we "remove" those files being
+ // compacted from the calculation, which then influences compaction
+ // score. Here we check if we need the new compaction even without the
+ // files that are currently being compacted. If we need another
+ // compaction, we might be able to execute it in parallel, so we add
+ // it to the queue and schedule a new thread.
+ if (cfd->NeedsCompaction()) {
+ // Yes, we need more compactions!
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+ }
+ }
+ }
+
+ if (!c) {
+ // Nothing to do
+ ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+ } else if (c->deletion_compaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is a live snapshot pointing to it
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ assert(c->num_input_files(1) == 0);
+ assert(c->level() == 0);
+ assert(c->column_family_data()->ioptions()->compaction_style ==
+ kCompactionStyleFIFO);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ // Instrument for event update
+ // TODO(yhchiang): add op details for showing trivial-move.
+ ThreadStatusUtil::SetColumnFamily(
+ c->column_family_data(), c->column_family_data()->ioptions()->env,
+ immutable_db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ // Move files to next level
+ int32_t moved_files = 0;
+ int64_t moved_bytes = 0;
+ for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+ if (c->level(l) == c->output_level()) {
+ continue;
+ }
+ for (size_t i = 0; i < c->num_input_files(l); i++) {
+ FileMetaData* f = c->input(l, i);
+ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+ c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
+ f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
+ f->largest, f->fd.smallest_seqno,
+ f->fd.largest_seqno, f->marked_for_compaction,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name);
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c->output_level(), f->fd.GetFileSize());
+ ++moved_files;
+ moved_bytes += f->fd.GetFileSize();
+ }
+ }
+
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ // Use latest MutableCFOptions
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+ moved_bytes);
+ {
+ event_logger_.LogToBuffer(log_buffer)
+ << "job" << job_context->job_id << "event"
+ << "trivial_move"
+ << "destination_level" << c->output_level() << "files" << moved_files
+ << "total_files_size" << moved_bytes;
+ }
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(), moved_files,
+ c->output_level(), moved_bytes, status.ToString().c_str(),
+ c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+ *made_progress = true;
+
+ // Clear Instrument
+ ThreadStatusUtil::ResetThreadStatus();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!is_prepicked && c->output_level() > 0 &&
+ c->output_level() ==
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->MaxOutputLevel(
+ immutable_db_options_.allow_ingest_behind) &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+ // Forward compactions involving the last level to the bottom pool if it
+ // exists, so that compactions unlikely to contribute to write stalls can be
+ // delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->compaction = c.release();
+ ca->prepicked_compaction->manual_compaction_state = nullptr;
+ // Transfer the already-requested token so the bottom-priority job doesn't
+ // need to request one again.
+ ca->prepicked_compaction->task_token = std::move(task_token);
+ ++bg_bottom_compaction_scheduled_;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+ this, &DBImpl::UnscheduleCompactionCallback);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ int output_level __attribute__((__unused__));
+ output_level = c->output_level();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+ &output_level);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()), stats_,
+ &mutex_, &error_handler_, snapshot_seqs,
+ earliest_write_conflict_snapshot, snapshot_checker, table_cache_,
+ &event_logger_, c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, thread_pri,
+ is_manual ? &manual_compaction_paused_ : nullptr);
+ compaction_job.Prepare();
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+ compaction_job.Run();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+ mutex_.Lock();
+
+ status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ }
+ if (c != nullptr) {
+ c->ReleaseCompactionFiles(status);
+ *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ }
+
+ if (status.ok() || status.IsCompactionTooLarge() ||
+ status.IsManualCompactionPaused()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+ // Put this cfd back in the compaction queue so we can retry after some
+ // time
+ auto cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ // Since this compaction failed, we need to recompute the score so it
+ // takes the original input files into account
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_cf_options()),
+ *(c->mutable_cf_options()));
+ if (!cfd->queued_for_compaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+ }
+ }
+ // this will unref its input_version and column_family_data
+ c.reset();
+
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+ // For universal compaction:
+ // Because universal compaction always happens at level 0, one
+ // compaction will pick up all overlapping files. No files will be
+ // filtered out due to the size limit and left for a successive
+ // compaction. So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, the current compaction
+ // writes a new file back to level 0, which will be used in a successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (m->manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->ioptions()->compaction_style !=
+ kCompactionStyleUniversal ||
+ m->cfd->ioptions()->num_levels > 1);
+ assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *m->manual_end;
+ m->begin = &m->tmp_storage;
+ m->incomplete = true;
+ }
+ m->in_progress = false; // not being processed anymore
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+ return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+ return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+ manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+ // Remove from queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ it = manual_compaction_dequeue_.erase(it);
+ return;
+ }
+ ++it;
+ }
+ assert(false);
+ return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+ if (num_running_ingest_file_ > 0) {
+ // We need to wait for other IngestExternalFile() calls to finish
+ // before running a manual compaction.
+ return true;
+ }
+ if (m->exclusive) {
+ return (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0);
+ }
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ bool seen = false;
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ ++it;
+ seen = true;
+ continue;
+ } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+ // The other manual compaction *it conflicts with m if it overlaps with m,
+ // is ahead of m in the queue, and is not yet in progress.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+ // Scan the manual compaction queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+ // A manual compaction that is already in progress or done is not counted
+ // here, so automatic compaction is still allowed while it runs.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+ // Scan the manual compaction queue for an exclusive entry
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
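+// Two manual compactions are considered to overlap (conflict) if either one is
+// exclusive or both target the same column family.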
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+ if ((m->exclusive) || (m1->exclusive)) {
+ return true;
+ }
+ if (m->cfd != m1->cfd) {
+ return false;
+ }
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::BuildCompactionJobInfo(
+ const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id,
+ const Version* current, CompactionJobInfo* compaction_job_info) const {
+ assert(compaction_job_info != nullptr);
+ compaction_job_info->cf_id = cfd->GetID();
+ compaction_job_info->cf_name = cfd->GetName();
+ compaction_job_info->status = st;
+ compaction_job_info->thread_id = env_->GetThreadID();
+ compaction_job_info->job_id = job_id;
+ compaction_job_info->base_input_level = c->start_level();
+ compaction_job_info->output_level = c->output_level();
+ compaction_job_info->stats = compaction_job_stats;
+ compaction_job_info->table_properties = c->GetOutputTableProperties();
+ compaction_job_info->compaction_reason = c->compaction_reason();
+ compaction_job_info->compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number,
+ desc.GetPathId());
+ compaction_job_info->input_files.push_back(fn);
+ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (compaction_job_info->table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ compaction_job_info->table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ compaction_job_info->output_files.push_back(TableFileName(
+ c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId()));
+ compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversions_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ mutex_.AssertHeld();
+
+ // Update max_total_in_memory_state_
+ size_t old_memtable_size = 0;
+ auto* old_sv = cfd->GetSuperVersion();
+ if (old_sv) {
+ old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+ old_sv->mutable_cf_options.max_write_buffer_number;
+ }
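+ // old_memtable_size is this CF's previous memtable budget
+ // (write_buffer_size * max_write_buffer_number); it is swapped for the new
+ // budget when max_total_in_memory_state_ is updated at the end of this
+ // function.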
+
+ // this branch is unlikely to step in
+ if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+ sv_context->NewSuperVersion();
+ }
+ cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
+
+ // There may be a small data race here. The snapshot blocking bottommost
+ // compaction may already have been released at this point. But assuming
+ // newer snapshots are always created and released frequently, the compaction
+ // will be triggered soon anyway.
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+ bottommost_files_mark_threshold_ = std::min(
+ bottommost_files_mark_threshold_,
+ my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+
+ // Whenever we install a new SuperVersion, we might need to issue new flushes or
+ // compactions.
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+
+ // Update max_total_in_memory_state_
+ max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+ mutable_cf_options.write_buffer_size *
+ mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and the db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren(), while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+ return files_grabbed_for_purge_.find(file_number) ==
+ files_grabbed_for_purge_.end() &&
+ purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+ files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+ InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we ever need to set it
+  // multiple times, we need to make sure the old one is not deleted while it
+  // is still in use by a compaction job.
+ assert(!snapshot_checker_);
+ snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+ JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker_ptr) {
+ mutex_.AssertHeld();
+ assert(job_context != nullptr);
+ assert(snapshot_seqs != nullptr);
+ assert(earliest_write_conflict_snapshot != nullptr);
+ assert(snapshot_checker_ptr != nullptr);
+
+ *snapshot_checker_ptr = snapshot_checker_.get();
+ if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+ *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+ }
+ if (*snapshot_checker_ptr != nullptr) {
+    // If snapshot_checker is used, the flush/compaction may contain values
+    // that are not visible to a snapshot taken after the flush/compaction job
+    // starts. Take a snapshot here so that it appears in snapshot_seqs and
+    // forces the compaction iterator to consider such snapshots.
+ const Snapshot* job_snapshot =
+ GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+ job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+ }
+ *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..610b57d39
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+void DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+}
+
+bool DBImpl::TEST_WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->TEST_BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InstrumentedMutexLock l(&mutex_);
+ metadata->resize(NumberLevels());
+ for (int level = 0; level < NumberLevels(); level++) {
+ const std::vector<FileMetaData*>& files =
+ cfd->current()->storage_info()->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ? level
+ : level + 1;
+ return RunManualCompaction(cfd, level, output_level, CompactRangeOptions(),
+ begin, end, true, disallow_trivial_move,
+ port::kMaxUint64 /*max_file_num_to_ignore*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+
+  // TODO: a bug here. This function does not necessarily wait for a
+  // compaction; it actually waits for any scheduled compaction OR flush
+  // to finish.
+
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError() == Status::OK())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
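+// A minimal usage sketch (illustrative only, not part of the upstream
+// source): gtest-based RocksDB tests usually reach these hooks through a
+// DBImpl* obtained from the public DB handle, e.g.:
+//
+//   DBImpl* impl = static_cast<DBImpl*>(db);  // db opened via DB::Open()
+//   ASSERT_OK(impl->TEST_FlushMemTable(/*wait=*/true,
+//                                      /*allow_write_stall=*/false,
+//                                      /*cfh=*/nullptr));
+//   ASSERT_OK(impl->TEST_WaitForCompact(/*wait_unscheduled=*/false));
+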
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
+ empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+void DBImpl::TEST_WaitForDumpStatsRun(std::function<void()> callback) const {
+ if (thread_dump_stats_ != nullptr) {
+ thread_dump_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+void DBImpl::TEST_WaitForPersistStatsRun(std::function<void()> callback) const {
+ if (thread_persist_stats_ != nullptr) {
+ thread_persist_stats_->TEST_WaitForRun(callback);
+ }
+}
+
+bool DBImpl::TEST_IsPersistentStatsEnabled() const {
+ return thread_persist_stats_ && thread_persist_stats_->IsRunning();
+}
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..f0c17ce95
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // compaction score
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ return Status::InvalidArgument("Target level does not exist");
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+ f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("PromoteL0 called during L0 compaction");
+ }
+
+ if (i == 0) continue;
+ auto prev_f = l0_files[i - 1];
+ if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+ " have overlapping ranges\n",
+ prev_f->fd.GetNumber(), f->fd.GetNumber());
+ job_context.Clean();
+ return Status::InvalidArgument("L0 has overlapping files");
+ }
+ }
+
+ // Check that all levels up to target_level are empty.
+ for (int level = 1; level <= target_level; ++level) {
+ if (vstorage->NumLevelFiles(level) > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Level %d not empty\n", level);
+ job_context.Clean();
+ return Status::InvalidArgument(
+ "All levels up to target_level "
+ "must be empty");
+ }
+ }
+
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : l0_files) {
+ edit.DeleteFile(0, f->fd.GetNumber());
+ edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time,
+ f->file_checksum, f->file_checksum_func_name);
+ }
+
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ } // lock released here
+ LogFlush(immutable_db_options_.info_log);
+ job_context.Clean();
+
+ return status;
+}
+#endif // ROCKSDB_LITE
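+
+// A minimal usage sketch (illustrative only, not part of the upstream
+// source), assuming the public wrappers declared in rocksdb/experimental.h:
+//
+//   #include "rocksdb/experimental.h"
+//
+//   // Mark the whole key range as a compaction candidate, then move the
+//   // (non-overlapping, non-compacting) L0 files straight to level 1.
+//   Status s = experimental::SuggestCompactRange(db, nullptr, nullptr);
+//   if (s.ok()) {
+//     s = experimental::PromoteL0(db, db->DefaultColumnFamily(), 1);
+//   }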
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..c5d07dd01
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,667 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+ if (allow_2pc()) {
+ return versions_->min_log_number_to_keep_2pc();
+ } else {
+ return versions_->MinLogNumberWithUnflushedData();
+ }
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+ mutex_.AssertHeld();
+ if (!pending_outputs_.empty()) {
+ return *pending_outputs_.begin();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+// * Returns the list of live files in 'sst_live'.
+// If it is doing a full scan:
+// * Returns the list of all files in the filesystem in
+//   'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except once every
+//   mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan) {
+ mutex_.AssertHeld();
+
+ // if deletion is disabled, do nothing
+ if (disable_delete_obsolete_files_ > 0) {
+ return;
+ }
+
+ bool doing_the_full_scan = false;
+
+ // logic for figuring out if we're doing the full scan
+ if (no_full_scan) {
+ doing_the_full_scan = false;
+ } else if (force ||
+ mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+ doing_the_full_scan = true;
+ } else {
+ const uint64_t now_micros = env_->NowMicros();
+ if ((delete_obsolete_files_last_run_ +
+ mutable_db_options_.delete_obsolete_files_period_micros) <
+ now_micros) {
+ doing_the_full_scan = true;
+ delete_obsolete_files_last_run_ = now_micros;
+ }
+ }
+
+  // Don't delete files that might currently be written to by compaction
+  // threads.
+  // Since job_context->min_pending_output is set, mutex_ cannot be released
+  // until the file scan finishes. Otherwise, we might see no
+  // min_pending_output here but later find newly generated, unfinalized files
+  // while scanning.
+ if (!pending_outputs_.empty()) {
+ job_context->min_pending_output = *pending_outputs_.begin();
+ } else {
+ // delete all of them
+ job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
+ }
+
+ // Get obsolete files. This function will also update the list of
+ // pending files in VersionSet().
+ versions_->GetObsoleteFiles(&job_context->sst_delete_files,
+ &job_context->manifest_delete_files,
+ job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files as grabbed for purge
+  // so that other threads calling FindObsoleteFiles with full_scan=true
+  // will not add these files to the candidate list for purge.
+ for (const auto& sst_to_del : job_context->sst_delete_files) {
+ MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+ }
+
+ // store the current filenum, lognum, etc
+ job_context->manifest_file_number = versions_->manifest_file_number();
+ job_context->pending_manifest_file_number =
+ versions_->pending_manifest_file_number();
+ job_context->log_number = MinLogNumberToKeep();
+ job_context->prev_log_number = versions_->prev_log_number();
+
+ versions_->AddLiveFiles(&job_context->sst_live);
+ if (doing_the_full_scan) {
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+ std::set<std::string> paths;
+ for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+ path_id++) {
+ paths.insert(immutable_db_options_.db_paths[path_id].path);
+ }
+
+    // Note that if cf_paths is not specified in the ColumnFamilyOptions
+    // of a particular column family, we use db_paths as the cf_paths
+    // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicates are removed while identifying
+    // unique files in PurgeObsoleteFiles.
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+ path_id++) {
+ auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+ if (paths.find(path) == paths.end()) {
+ paths.insert(path);
+ }
+ }
+ }
+
+ for (auto& path : paths) {
+      // Set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processing steps.
+ std::vector<std::string> files;
+ env_->GetChildren(path, &files); // Ignore errors
+ for (const std::string& file : files) {
+ uint64_t number;
+ FileType type;
+        // 1. If we cannot parse the file name, we skip it;
+        // 2. If the file with this file number has already been grabbed for
+        //    purge by another compaction job, or has already been scheduled
+        //    for purge, we also skip it when doing a full scan, in order to
+        //    avoid double deletion of the same file under race conditions.
+        //    See https://github.com/facebook/rocksdb/issues/3573
+ if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+ !ShouldPurge(number)) {
+ continue;
+ }
+
+ // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
+ job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+ }
+ }
+
+ // Add log files in wal_dir
+ if (immutable_db_options_.wal_dir != dbname_) {
+ std::vector<std::string> log_files;
+ env_->GetChildren(immutable_db_options_.wal_dir,
+ &log_files); // Ignore errors
+ for (const std::string& log_file : log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.wal_dir);
+ }
+ }
+ // Add info log files in db_log_dir
+ if (!immutable_db_options_.db_log_dir.empty() &&
+ immutable_db_options_.db_log_dir != dbname_) {
+ std::vector<std::string> info_log_files;
+ // Ignore errors
+ env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files);
+ for (std::string& log_file : info_log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.db_log_dir);
+ }
+ }
+ }
+
+ // logs_ is empty when called during recovery, in which case there can't yet
+ // be any tracked obsolete logs
+ if (!alive_log_files_.empty() && !logs_.empty()) {
+ uint64_t min_log_number = job_context->log_number;
+ size_t num_alive_log_files = alive_log_files_.size();
+ // find newly obsoleted log files
+ while (alive_log_files_.begin()->number < min_log_number) {
+ auto& earliest = *alive_log_files_.begin();
+ if (immutable_db_options_.recycle_log_file_num >
+ log_recycle_files_.size()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "adding log %" PRIu64 " to recycle list\n",
+ earliest.number);
+ log_recycle_files_.push_back(earliest.number);
+ } else {
+ job_context->log_delete_files.push_back(earliest.number);
+ }
+ if (job_context->size_log_to_delete == 0) {
+ job_context->prev_total_log_size = total_log_size_;
+ job_context->num_alive_log_files = num_alive_log_files;
+ }
+ job_context->size_log_to_delete += earliest.size;
+ total_log_size_ -= earliest.size;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ alive_log_files_.pop_front();
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ // Current log should always stay alive since it can't have
+ // number < MinLogNumber().
+ assert(alive_log_files_.size());
+ }
+ while (!logs_.empty() && logs_.front().number < min_log_number) {
+ auto& log = logs_.front();
+ if (log.getting_synced) {
+ log_sync_cv_.Wait();
+ // logs_ could have changed while we were waiting.
+ continue;
+ }
+ logs_to_free_.push_back(log.ReleaseWriter());
+ {
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ logs_.pop_front();
+ }
+ }
+ // Current log cannot be obsolete.
+ assert(!logs_.empty());
+ }
+
+ // We're just cleaning up for DB::Write().
+ assert(job_context->logs_to_free.empty());
+ job_context->logs_to_free = logs_to_free_;
+ job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+ log_recycle_files_.end());
+ if (job_context->HaveSomethingToDelete()) {
+ ++pending_purge_obsolete_files_;
+ }
+ logs_to_free_.clear();
+}
+
+namespace {
+bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
+ const JobContext::CandidateFileInfo& second) {
+ if (first.file_name > second.file_name) {
+ return true;
+ } else if (first.file_name < second.file_name) {
+ return false;
+ } else {
+ return (first.file_path > second.file_path);
+ }
+}
+}  // namespace
+
+// Delete obsolete files and log status and information of file deletion
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync,
+ FileType type, uint64_t number) {
+ Status file_deletion_status;
+ if (type == kTableFile || type == kLogFile) {
+ file_deletion_status =
+ DeleteDBFile(&immutable_db_options_, fname, path_to_sync,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_);
+ } else {
+ file_deletion_status = env_->DeleteFile(fname);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+ &file_deletion_status);
+ if (file_deletion_status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+ fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else if (env_->FileExists(fname).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+ " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ }
+ if (type == kTableFile) {
+ EventHelpers::LogAndNotifyTableFileDeletion(
+ &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+ immutable_db_options_.listeners);
+ }
+}
+
+// Diffs the files listed in filenames against the live files, and possibly
+// removes those that do not belong to the live set. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+  // We'd better have something to delete.
+ assert(state.HaveSomethingToDelete());
+
+  // FindObsoleteFiles() should have populated this, so it must be nonzero.
+ assert(state.manifest_file_number != 0);
+
+  // Now, convert the live list to an unordered map, WITHOUT the mutex held;
+  // set lookups are slow.
+ std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
+ for (const FileDescriptor& fd : state.sst_live) {
+ sst_live_map[fd.GetNumber()] = &fd;
+ }
+ std::unordered_set<uint64_t> log_recycle_files_set(
+ state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+ auto candidate_files = state.full_scan_candidate_files;
+ candidate_files.reserve(
+ candidate_files.size() + state.sst_delete_files.size() +
+ state.log_delete_files.size() + state.manifest_delete_files.size());
+ // We may ignore the dbname when generating the file names.
+ for (auto& file : state.sst_delete_files) {
+ candidate_files.emplace_back(
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ }
+ file.DeleteMetadata();
+ }
+
+ for (auto file_num : state.log_delete_files) {
+ if (file_num > 0) {
+ candidate_files.emplace_back(LogFileName(file_num),
+ immutable_db_options_.wal_dir);
+ }
+ }
+ for (const auto& filename : state.manifest_delete_files) {
+ candidate_files.emplace_back(filename, dbname_);
+ }
+
+ // dedup state.candidate_files so we don't try to delete the same
+ // file twice
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ CompareCandidateFile);
+ candidate_files.erase(
+ std::unique(candidate_files.begin(), candidate_files.end()),
+ candidate_files.end());
+
+ if (state.prev_total_log_size > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Try to delete WAL files size %" PRIu64
+ ", prev total WAL file size %" PRIu64
+ ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+ state.job_id, state.size_log_to_delete,
+ state.prev_total_log_size, state.num_alive_log_files);
+ }
+
+ std::vector<std::string> old_info_log_files;
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+
+  // File numbers of the two most recent OPTIONS files in candidate_files
+  // (found in the previous FindObsoleteFiles(full_scan=true)).
+ // At this point, there must not be any duplicate file numbers in
+ // candidate_files.
+ uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+ uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& fname = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+ type != kOptionsFile) {
+ continue;
+ }
+ if (number > optsfile_num1) {
+ optsfile_num2 = optsfile_num1;
+ optsfile_num1 = number;
+ } else if (number > optsfile_num2) {
+ optsfile_num2 = number;
+ }
+ }
+
+ // Close WALs before trying to delete them.
+ for (const auto w : state.logs_to_free) {
+ // TODO: maybe check the return value of Close.
+ w->Close();
+ }
+
+ bool own_files = OwnTablesAndLogs();
+ std::unordered_set<uint64_t> files_to_del;
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& to_delete = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ // Ignore file if we cannot recognize it.
+ if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+ continue;
+ }
+
+ bool keep = true;
+ switch (type) {
+ case kLogFile:
+ keep = ((number >= state.log_number) ||
+ (number == state.prev_log_number) ||
+ (log_recycle_files_set.find(number) !=
+ log_recycle_files_set.end()));
+ break;
+ case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations of it
+        // (can happen during manifest roll).
+ keep = (number >= state.manifest_file_number);
+ break;
+ case kTableFile:
+        // Without the second condition, DontDeletePendingOutputs would fail.
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ number >= state.min_pending_output;
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into the "live"
+        // set. Also, SetCurrentFile creates a temp file when writing out a new
+        // manifest; its number is equal to state.pending_manifest_file_number.
+        // We should not delete that file.
+ //
+ // TODO(yhchiang): carefully modify the third condition to safely
+ // remove the temp options files.
+ keep = (sst_live_map.find(number) != sst_live_map.end()) ||
+ (number == state.pending_manifest_file_number) ||
+ (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+ break;
+ case kInfoLogFile:
+ keep = true;
+ if (number != 0) {
+ old_info_log_files.push_back(to_delete);
+ }
+ break;
+ case kOptionsFile:
+ keep = (number >= optsfile_num2);
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1",
+ reinterpret_cast<void*>(&number));
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2",
+ reinterpret_cast<void*>(&keep));
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kIdentityFile:
+ case kMetaDatabase:
+ case kBlobFile:
+ keep = true;
+ break;
+ }
+
+ if (keep) {
+ continue;
+ }
+
+ std::string fname;
+ std::string dir_to_sync;
+ if (type == kTableFile) {
+ // evict from cache
+ TableCache::Evict(table_cache_.get(), number);
+ fname = MakeTableFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else {
+ dir_to_sync =
+ (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_;
+ fname = dir_to_sync +
+ ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+ (!to_delete.empty() && to_delete.front() == '/')
+ ? ""
+ : "/") +
+ to_delete;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
+ immutable_db_options_.wal_size_limit_mb > 0)) {
+ wal_manager_.ArchiveWALFile(fname, number);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+    // If I do not own these files, e.g. a secondary instance with
+    // max_open_files = -1, then there is no need to delete or schedule
+    // deletion of these files since they will be removed by their owner,
+    // e.g. the primary instance.
+ if (!own_files) {
+ continue;
+ }
+ Status file_deletion_status;
+ if (schedule_only) {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+ } else {
+ DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+ }
+ }
+
+ {
+ // After purging obsolete files, remove them from files_grabbed_for_purge_.
+ InstrumentedMutexLock guard_lock(&mutex_);
+ autovector<uint64_t> to_be_removed;
+ for (auto fn : files_grabbed_for_purge_) {
+ if (files_to_del.count(fn) != 0) {
+ to_be_removed.emplace_back(fn);
+ }
+ }
+ for (auto fn : to_be_removed) {
+ files_grabbed_for_purge_.erase(fn);
+ }
+ }
+
+ // Delete old info log files.
+ size_t old_info_log_file_count = old_info_log_files.size();
+ if (old_info_log_file_count != 0 &&
+ old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+ std::sort(old_info_log_files.begin(), old_info_log_files.end());
+ size_t end =
+ old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+ for (unsigned int i = 0; i <= end; i++) {
+ std::string& to_delete = old_info_log_files.at(i);
+ std::string full_path_to_delete =
+ (immutable_db_options_.db_log_dir.empty()
+ ? dbname_
+ : immutable_db_options_.db_log_dir) +
+ "/" + to_delete;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s\n", state.job_id,
+ full_path_to_delete.c_str());
+ Status s = env_->DeleteFile(full_path_to_delete);
+ if (!s.ok()) {
+ if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+ "-- %s\n",
+ state.job_id, to_delete.c_str(), s.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s FAILED -- %s\n",
+ state.job_id, to_delete.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ }
+ }
+#ifndef ROCKSDB_LITE
+ wal_manager_.PurgeObsoleteWALFiles();
+#endif // ROCKSDB_LITE
+ LogFlush(immutable_db_options_.info_log);
+ InstrumentedMutexLock l(&mutex_);
+ --pending_purge_obsolete_files_;
+ assert(pending_purge_obsolete_files_ >= 0);
+ if (pending_purge_obsolete_files_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+ const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ // we must look through the memtables for two phase transactions
+ // that have been committed but not yet flushed
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ memtables_to_flush);
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ autovector<VersionEdit*> edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ // Calculate updated min_log_number_to_keep
+ // Since the function should only be called in 2pc mode, log number in
+ // the version edit should be sufficient.
+
+ // Precompute the min log number containing unflushed data for the column
+ // family being flushed (`cfd_to_flush`).
+ uint64_t cf_min_log_number_to_keep = 0;
+ for (auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ cf_min_log_number_to_keep =
+ std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+ }
+ }
+ if (cf_min_log_number_to_keep == 0) {
+ // No version edit contains information on log number. The log number
+ // for this column family should stay the same as it is.
+ cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+ }
+
+ // Get min log number containing unflushed data for other column families.
+ uint64_t min_log_number_to_keep =
+ vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+ if (cf_min_log_number_to_keep != 0) {
+ min_log_number_to_keep =
+ std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+ }
+
+  // If we are in 2PC mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check the min logs with outstanding prep before we check logs
+  // referenced by memtables, because a log referenced by the first data
+  // structure could transition to the second under us.
+ //
+ // TODO: iterating over all column families under db mutex.
+ // should find more optimal solution
+ auto min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
+ vset, &cfd_to_flush, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+ return min_log_number_to_keep;
+}
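+
+// Worked example (illustrative only, not part of the upstream source): if the
+// flushed column family's edits carry log number 12, other column families
+// still hold unflushed data back to log 10, the prep-section tracker reports
+// log 9, and no memtable references an older prep log, the result is
+// min(12, 10, 9) = 9, i.e. WAL files numbered 9 and above must be kept.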
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..6ae4ead54
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,1651 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/error_handler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src) {
+ auto db_options = SanitizeOptions(dbname, DBOptions(src));
+ ImmutableDBOptions immutable_db_options(db_options);
+ auto cf_options =
+ SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+ return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
+ DBOptions result(src);
+
+ if (result.file_system == nullptr) {
+ if (result.env == Env::Default()) {
+ result.file_system = FileSystem::Default();
+ } else {
+ result.file_system.reset(new LegacyFileSystemWrapper(result.env));
+ }
+ } else {
+ if (result.env == nullptr) {
+ result.env = Env::Default();
+ }
+ }
+
+  // A max_open_files value of -1 means "unlimited" open files.
+ if (result.max_open_files != -1) {
+ int max_max_open_files = port::GetMaxOpenFiles();
+ if (max_max_open_files == -1) {
+ max_max_open_files = 0x400000;
+ }
+ ClipToRange(&result.max_open_files, 20, max_max_open_files);
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+ &result.max_open_files);
+ }
+
+ if (result.info_log == nullptr) {
+ Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ }
+ }
+
+ if (!result.write_buffer_manager) {
+ result.write_buffer_manager.reset(
+ new WriteBufferManager(result.db_write_buffer_size));
+ }
+ auto bg_job_limits = DBImpl::GetBGJobLimits(
+ result.max_background_flushes, result.max_background_compactions,
+ result.max_background_jobs, true /* parallelize_compactions */);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+
+ if (result.rate_limiter.get() != nullptr) {
+ if (result.bytes_per_sync == 0) {
+ result.bytes_per_sync = 1024 * 1024;
+ }
+ }
+
+ if (result.delayed_write_rate == 0) {
+ if (result.rate_limiter.get() != nullptr) {
+ result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+ }
+ if (result.delayed_write_rate == 0) {
+ result.delayed_write_rate = 16 * 1024 * 1024;
+ }
+ }
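+  // For example (illustrative): with a 32 MB/s rate limiter and
+  // delayed_write_rate == 0, the delayed write rate becomes 32 MB/s; with no
+  // rate limiter configured it falls back to 16 MB/s.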
+
+ if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+ result.recycle_log_file_num = false;
+ }
+
+ if (result.recycle_log_file_num &&
+ (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+ result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+ // kPointInTimeRecovery is inconsistent with recycle log file feature since
+ // we define the "end" of the log as the first corrupt record we encounter.
+ // kAbsoluteConsistency doesn't make sense because even a clean
+ // shutdown leaves old junk at the end of the log file.
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.wal_dir.empty()) {
+ // Use dbname as default
+ result.wal_dir = dbname;
+ }
+ if (result.wal_dir.back() == '/') {
+ result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+ }
+
+ if (result.db_paths.size() == 0) {
+ result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+ }
+
+ if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+ result.compaction_readahead_size = 1024 * 1024 * 2;
+ }
+
+ if (result.compaction_readahead_size > 0 || result.use_direct_reads) {
+ result.new_table_reader_for_compaction_inputs = true;
+ }
+
+  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+  // guarantee that consecutive log files have consecutive sequence ids, which
+  // makes recovery complicated.
+ if (result.allow_2pc) {
+ result.avoid_flush_during_recovery = false;
+ }
+
+#ifndef ROCKSDB_LITE
+ ImmutableDBOptions immutable_db_options(result);
+ if (!IsWalDirSameAsDBPath(&immutable_db_options)) {
+    // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+    // cannot tell for sure. In either case, assume they're different and
+    // explicitly clean up the trash log files (bypassing DeleteScheduler).
+    // Do this first so that even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe.
+ std::vector<std::string> filenames;
+ result.env->GetChildren(result.wal_dir, &filenames);
+ for (std::string& filename : filenames) {
+ if (filename.find(".log.trash", filename.length() -
+ std::string(".log.trash").length()) !=
+ std::string::npos) {
+ std::string trash_file = result.wal_dir + "/" + filename;
+ result.env->DeleteFile(trash_file);
+ }
+ }
+ }
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet. When we open the DB we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if
+  // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+ for (size_t i = 0; i < result.db_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path);
+ }
+
+ // Create a default SstFileManager for purposes of tracking compaction size
+ // and facilitating recovery from out of space errors.
+ if (result.sst_file_manager.get() == nullptr) {
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(result.env, result.info_log));
+ result.sst_file_manager = sst_file_manager;
+ }
+#endif
+
+ if (!result.paranoid_checks) {
+ result.skip_checking_sst_file_sizes_on_db_open = true;
+ ROCKS_LOG_INFO(result.info_log,
+ "file size check will be skipped during open.");
+ }
+
+ return result;
+}
+
+namespace {
+Status SanitizeOptionsByTable(
+ const DBOptions& db_opts,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto cf : column_families) {
+ s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Status DBImpl::ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto& cfd : column_families) {
+ s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = ValidateOptions(db_options);
+ return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+ if (db_options.db_paths.size() > 4) {
+ return Status::NotSupported(
+ "More than four DB paths are not supported yet. ");
+ }
+
+ if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+ // Protect against assert in PosixMMapReadableFile constructor
+ return Status::NotSupported(
+ "If memory mapped reads (allow_mmap_reads) are enabled "
+ "then direct I/O reads (use_direct_reads) must be disabled. ");
+ }
+
+ if (db_options.allow_mmap_writes &&
+ db_options.use_direct_io_for_flush_and_compaction) {
+ return Status::NotSupported(
+ "If memory mapped writes (allow_mmap_writes) are enabled "
+ "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+ "be disabled. ");
+ }
+
+ if (db_options.keep_log_file_num == 0) {
+ return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+ }
+
+ if (db_options.unordered_write &&
+ !db_options.allow_concurrent_memtable_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with !allow_concurrent_memtable_write");
+ }
+
+ if (db_options.unordered_write && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with enable_pipelined_write");
+ }
+
+ if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "atomic_flush is incompatible with enable_pipelined_write");
+ }
+
+ return Status::OK();
+}
+
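+// A minimal sketch (illustrative only, not part of the upstream source) of
+// how the checks above surface to callers: DB::Open() runs ValidateOptions(),
+// so conflicting settings are rejected at open time, e.g.:
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   options.allow_mmap_reads = true;
+//   options.use_direct_reads = true;  // conflicts with allow_mmap_reads
+//   DB* db = nullptr;
+//   Status s = DB::Open(options, "/tmp/db_example", &db);
+//   assert(s.IsNotSupported());
+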
+Status DBImpl::NewDB() {
+ VersionEdit new_db;
+ Status s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.write_dbid_to_manifest) {
+ std::string temp_db_id;
+ GetDbIdentityFromIdentityFile(&temp_db_id);
+ new_db.SetDBId(temp_db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ {
+ std::unique_ptr<FSWritableFile> file;
+ FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+ s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+ if (!s.ok()) {
+ return s;
+ }
+ file->SetPreallocationBlockSize(
+ immutable_db_options_.manifest_preallocation_size);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), manifest, file_options, env_, nullptr /* stats */,
+ immutable_db_options_.listeners));
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = SyncManifest(env_, &immutable_db_options_, log.file());
+ }
+ }
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir());
+ } else {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr);
+ }
+ return s;
+}
+
+Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname,
+ std::unique_ptr<Directory>* directory) {
+  // We call CreateDirIfMissing() as the directory may already exist (if we
+  // are reopening a DB); when this happens we don't want creating the
+  // directory to cause an error. However, we need to check if creating the
+  // directory fails or else we may get an obscure message about the lock
+  // file not existing. One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but "dir" doesn't exist.
+ Status s = env->CreateDirIfMissing(dirname);
+ if (!s.ok()) {
+ return s;
+ }
+ return env->NewDirectory(dirname, directory);
+}
+
+Status Directories::SetDirectories(Env* env, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths) {
+ Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (!wal_dir.empty() && dbname != wal_dir) {
+ s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ data_dirs_.clear();
+ for (auto& p : data_paths) {
+ const std::string db_path = p.path;
+ if (db_path == dbname) {
+ data_dirs_.emplace_back(nullptr);
+ } else {
+ std::unique_ptr<Directory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ data_dirs_.emplace_back(path_directory.release());
+ }
+ }
+ assert(data_dirs_.size() == data_paths.size());
+ return Status::OK();
+}
+
+Status DBImpl::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ bool error_if_log_file_exist, bool error_if_data_exists_in_logs,
+ uint64_t* recovered_seq) {
+ mutex_.AssertHeld();
+
+ bool is_new_db = false;
+ assert(db_lock_ == nullptr);
+ if (!read_only) {
+ Status s = directories_.SetDirectories(env_, dbname_,
+ immutable_db_options_.wal_dir,
+ immutable_db_options_.db_paths);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string current_fname = CurrentFileName(dbname_);
+ s = env_->FileExists(current_fname);
+ if (s.IsNotFound()) {
+ if (immutable_db_options_.create_if_missing) {
+ s = NewDB();
+ is_new_db = true;
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ current_fname, "does not exist (create_if_missing is false)");
+ }
+ } else if (s.ok()) {
+ if (immutable_db_options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ } else {
+ // Unexpected error reading file
+ assert(s.IsIOError());
+ return s;
+ }
+ // Verify compatibility of file_options_ and filesystem
+ {
+ std::unique_ptr<FSRandomAccessFile> idfile;
+ FileOptions customized_fs(file_options_);
+ customized_fs.use_direct_reads |=
+ immutable_db_options_.use_direct_io_for_flush_and_compaction;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (!s.ok()) {
+ std::string error_str = s.ToString();
+ // Check if unsupported Direct I/O is the root cause
+ customized_fs.use_direct_reads = false;
+ s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile,
+ nullptr);
+ if (s.ok()) {
+ return Status::InvalidArgument(
+ "Direct I/O is not supported by the specified DB.");
+ } else {
+ return Status::InvalidArgument(
+ "Found options incompatible with filesystem", error_str.c_str());
+ }
+ }
+ }
+ }
+ assert(db_id_.empty());
+ Status s = versions_->Recover(column_families, read_only, &db_id_);
+ if (!s.ok()) {
+ return s;
+ }
+  // This happens when immutable_db_options_.write_dbid_to_manifest is set to
+  // true for the very first time.
+ if (db_id_.empty()) {
+ // Check for the IDENTITY file and create it if not there.
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+    // The IDENTITY file is typically created in NewDB(). If for some reason
+    // it is no longer available, then at this point the DB ID is in neither
+    // the IDENTITY file nor the MANIFEST.
+ if (s.IsNotFound()) {
+ s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!s.ok()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ s = GetDbIdentityFromIdentityFile(&db_id_);
+ if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
+ VersionEdit edit;
+ edit.SetDBId(db_id_);
+ Options options;
+ MutableCFOptions mutable_cf_options(options);
+ versions_->db_id_ = db_id_;
+ s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &edit, &mutex_, nullptr,
+ false);
+ }
+ } else {
+ s = SetIdentityFile(env_, dbname_, db_id_);
+ }
+
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ if (s.ok() && !read_only) {
+ std::map<std::string, std::shared_ptr<Directory>> created_dirs;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ s = cfd->AddDirectories(&created_dirs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ // DB mutex is already held
+ if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
+ s = InitPersistStatsColumnFamily();
+ }
+
+ if (s.ok()) {
+    // Initialize max_total_in_memory_state_ before recovering the logs. Log
+    // recovery may check this value to decide whether to flush.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ // TODO(Zhongyi): handle single_column_family_mode_ when
+ // persistent_stats is enabled
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that prev_log_number() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of rocksdb.
+ std::vector<std::string> filenames;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("wal_dir not found",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<uint64_t> logs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
+ if (is_new_db) {
+ return Status::Corruption(
+ "While creating a new Db, wal_dir contains "
+ "existing log file: ",
+ filenames[i]);
+ } else {
+ logs.push_back(number);
+ }
+ }
+ }
+
+ if (logs.size() > 0) {
+ if (error_if_log_file_exist) {
+ return Status::Corruption(
+ "The db was opened in readonly mode with error_if_log_file_exist"
+ "flag but a log file already exists");
+ } else if (error_if_data_exists_in_logs) {
+ for (auto& log : logs) {
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log);
+ uint64_t bytes;
+ s = env_->GetFileSize(fname, &bytes);
+ if (s.ok()) {
+ if (bytes > 0) {
+ return Status::Corruption(
+ "error_if_data_exists_in_logs is set but there are data "
+ " in log files.");
+ }
+ }
+ }
+ }
+ }
+
+ if (!logs.empty()) {
+ // Recover in the order in which the logs were generated
+ std::sort(logs.begin(), logs.end());
+ bool corrupted_log_found = false;
+ s = RecoverLogFiles(logs, &next_sequence, read_only,
+ &corrupted_log_found);
+ if (corrupted_log_found && recovered_seq != nullptr) {
+ *recovered_seq = next_sequence;
+ }
+ if (!s.ok()) {
+ // Clear memtables if recovery failed
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ }
+ }
+ }
+
+ if (read_only) {
+ // If we are opening as read-only, we need to update options_file_number_
+ // to reflect the most recent OPTIONS file. It does not matter for a
+ // regular read-write db instance because options_file_number_ will later be
+ // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+ std::vector<std::string> file_names;
+ if (s.ok()) {
+ s = env_->GetChildren(GetName(), &file_names);
+ }
+ if (s.ok()) {
+ uint64_t number = 0;
+ uint64_t options_file_number = 0;
+ FileType type;
+ for (const auto& fname : file_names) {
+ if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+ options_file_number = std::max(number, options_file_number);
+ }
+ }
+ versions_->options_file_number_ = options_file_number;
+ }
+ }
+
+ return s;
+}
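+
+// A minimal caller-side sketch of how the DB ID recovered above can be
+// inspected; the path and option values are illustrative only:
+//
+//   #include <cassert>
+//   #include <string>
+//   #include "rocksdb/db.h"
+//
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   options.write_dbid_to_manifest = true;  // also record the ID in MANIFEST
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
+//   assert(s.ok());
+//   std::string db_id;
+//   s = db->GetDbIdentity(db_id);  // returns the ID recovered by Recover()
+//   assert(s.ok());
+//   delete db;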
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+ mutex_.AssertHeld();
+ Status s;
+ // persist version when stats CF doesn't exist
+ bool should_persist_format_version = !persistent_stats_cfd_exists_;
+ mutex_.Unlock();
+ if (persistent_stats_cfd_exists_) {
+ // Check persistent stats format version compatibility. Drop and recreate
+ // persistent stats CF if format version is incompatible
+ uint64_t format_version_recovered = 0;
+ Status s_format = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+ uint64_t compatible_version_recovered = 0;
+ Status s_compatible = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kCompatibleVersion,
+ &compatible_version_recovered);
+ // Abort reading from the existing stats CF if any of the following is true:
+ // 1. we failed to read the format version or the compatible version from
+ //    disk
+ // 2. the recovered format version is greater than the current format
+ //    version (i.e. the data was encoded by a newer RocksDB release) and the
+ //    current compatible version is below the recovered compatible version
+ if (!s_format.ok() || !s_compatible.ok() ||
+ (kStatsCFCurrentFormatVersion < format_version_recovered &&
+ kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+ if (!s_format.ok() || !s_compatible.ok()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Reading persistent stats version key failed. Format key: %s, "
+ "compatible key: %s",
+ s_format.ToString().c_str(), s_compatible.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Disable persistent stats due to corrupted or incompatible format "
+ "version\n");
+ }
+ DropColumnFamily(persist_stats_cf_handle_);
+ DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ // should also persist version here because old stats CF is discarded
+ should_persist_format_version = true;
+ }
+ }
+ if (s.ok() && should_persist_format_version) {
+ // Persistent stats CF being created for the first time, need to write
+ // format version key
+ WriteBatch batch;
+ batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+ ToString(kStatsCFCurrentFormatVersion));
+ batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+ ToString(kStatsCFCompatibleFormatVersion));
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ mutex_.Lock();
+ return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+ mutex_.AssertHeld();
+ assert(!persist_stats_cf_handle_);
+ ColumnFamilyData* persistent_stats_cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+ Status s;
+ if (persistent_stats_cfd != nullptr) {
+ // We are recovering from a DB which already contains the persistent stats
+ // CF. The CF itself was already created in VersionSet::ApplyOneVersionEdit,
+ // but the column family handle was not, so we need to create it here.
+ persist_stats_cf_handle_ =
+ new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+ } else {
+ mutex_.Unlock();
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ mutex_.Lock();
+ }
+ return s;
+}
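+
+// The persistent stats column family above is driven entirely by DBOptions;
+// a minimal configuration sketch (the period value is illustrative only):
+//
+//   rocksdb::Options options;
+//   options.create_if_missing = true;
+//   options.persist_stats_to_disk = true;    // creates the hidden stats CF
+//   options.stats_persist_period_sec = 600;  // snapshot stats every 10 min
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);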
+
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+
+ mutex_.AssertHeld();
+ Status status;
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+ int job_id = next_job_id_.fetch_add(1);
+ {
+ auto stream = event_logger_.Log();
+ stream << "job" << job_id << "event"
+ << "recovery_started";
+ stream << "log_files";
+ stream.StartArray();
+ for (auto log_number : log_numbers) {
+ stream << log_number;
+ }
+ stream.EndArray();
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ std::map<std::string, uint32_t> cf_name_id_map;
+ std::map<uint32_t, uint64_t> cf_lognumber_map;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+ cf_lognumber_map.insert(
+ std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+ }
+
+ immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map,
+ cf_name_id_map);
+ }
+#endif
+
+ bool stop_replay_by_wal_filter = false;
+ bool stop_replay_for_corruption = false;
+ bool flushed = false;
+ uint64_t corrupted_log_number = kMaxSequenceNumber;
+ uint64_t min_log_number = MinLogNumberToKeep();
+ for (auto log_number : log_numbers) {
+ if (log_number < min_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Skipping log #%" PRIu64
+ " since it is older than min log to keep #%" PRIu64,
+ log_number, min_log_number);
+ continue;
+ }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+ auto logFileDropped = [this, &fname]() {
+ uint64_t bytes;
+ if (env_->GetFileSize(fname, &bytes).ok()) {
+ auto info_log = immutable_db_options_.info_log.get();
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+ static_cast<int>(bytes));
+ }
+ };
+ if (stop_replay_by_wal_filter) {
+ logFileDropped();
+ continue;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ status = fs_->NewSequentialFile(fname,
+ fs_->OptimizeForLogRead(file_options_),
+ &file, nullptr);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Fail with one log file, but that's ok.
+ // Try next one.
+ continue;
+ }
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = immutable_db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ if (!immutable_db_options_.paranoid_checks ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ reporter.status = nullptr;
+ } else {
+ reporter.status = &status;
+ }
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+ &reporter, true /*checksum*/, log_number);
+
+ // Determine whether we should tolerate incomplete records at the tail end
+ // of the log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (!stop_replay_by_wal_filter &&
+ reader.ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // In point-in-time recovery mode, if the sequence ids of the log files
+ // are consecutive, we continue recovery despite corruption. This could
+ // happen when we open and write to a corrupted DB, where the sequence id
+ // starts from the last sequence id we recovered.
+ if (sequence == *next_sequence) {
+ stop_replay_for_corruption = false;
+ }
+ if (stop_replay_for_corruption) {
+ logFileDropped();
+ break;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter != nullptr) {
+ WriteBatch new_batch;
+ bool batch_changed = false;
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ immutable_db_options_.wal_filter->LogRecordFound(
+ log_number, fname, batch, &new_batch, &batch_changed);
+
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kContinueProcessing:
+ // do nothing, proceed normally
+ break;
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+ // skip current record
+ continue;
+ case WalFilter::WalProcessingOption::kStopReplay:
+ // skip current record and stop replay
+ stop_replay_by_wal_filter = true;
+ continue;
+ case WalFilter::WalProcessingOption::kCorruptedRecord: {
+ status =
+ Status::Corruption("Corruption reported by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+ break;
+ }
+ default: {
+ assert(false); // unhandled case
+ status = Status::NotSupported(
+ "Unknown WalProcessingOption returned"
+ " by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Ignore the error with current record processing.
+ continue;
+ }
+ }
+ }
+
+ if (batch_changed) {
+ // Make sure that the count in the new batch is
+ // within the original count.
+ int new_count = WriteBatchInternal::Count(&new_batch);
+ int original_count = WriteBatchInternal::Count(&batch);
+ if (new_count > original_count) {
+ ROCKS_LOG_FATAL(
+ immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64
+ " mode %d log filter %s returned "
+ "more records (%d) than original (%d) which is not allowed. "
+ "Aborting recovery.",
+ log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode),
+ immutable_db_options_.wal_filter->Name(), new_count,
+ original_count);
+ status = Status::NotSupported(
+ "More than original # of records "
+ "returned by Wal Filter ",
+ immutable_db_options_.wal_filter->Name());
+ return status;
+ }
+ // Set the same sequence number in the new_batch
+ // as the original batch.
+ WriteBatchInternal::SetSequence(&new_batch,
+ WriteBatchInternal::Sequence(&batch));
+ batch = new_batch;
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ // If the column family was not found, it might mean that the WAL write
+ // batch references a column family that was dropped after the insert.
+ // We don't want to fail the whole write batch in that case -- we just
+ // ignore the update.
+ // That's why we set ignore_missing_column_families to true.
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), &flush_scheduler_,
+ &trim_history_scheduler_, true, log_number, this,
+ false /* concurrent_memtable_writes */, next_sequence,
+ &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+
+ if (has_valid_writes && !read_only) {
+ // we can do this because this is called before the client has access to
+ // the DB and there is only a single thread operating on the DB
+ ColumnFamilyData* cfd;
+
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= log_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ *next_sequence);
+ }
+ }
+ }
+
+ if (!status.ok()) {
+ if (status.IsNotSupported()) {
+ // We should not treat NotSupported as corruption. It is rather a clear
+ // sign that we are processing a WAL that is produced by an incompatible
+ // version of the code.
+ return status;
+ }
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // We should ignore all errors unconditionally
+ status = Status::OK();
+ } else if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // We should ignore the error but not continue replaying
+ status = Status::OK();
+ stop_replay_for_corruption = true;
+ corrupted_log_number = log_number;
+ if (corrupted_log_found != nullptr) {
+ *corrupted_log_found = true;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Point in time recovered to log #%" PRIu64
+ " seq #%" PRIu64,
+ log_number, *next_sequence);
+ } else {
+ assert(immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency);
+ return status;
+ }
+ }
+
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ }
+ // Compare the corrupted log number to every column family's current log
+ // number. Abort Open() if any column family's log number is greater than
+ // the corrupted log number, which means that CF contains data beyond the
+ // point of corruption. This could happen during PIT recovery when the WAL
+ // is corrupted and some (but not all) CFs are flushed.
+ // Exclude the PIT case where no log is dropped after the corruption point.
+ // This is to cover the case of empty logs after the corrupted log, in which
+ // we don't reset stop_replay_for_corruption.
+ if (stop_replay_for_corruption == true &&
+ (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->GetLogNumber() > corrupted_log_number) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Column family inconsistency: SST file contains data"
+ " beyond the point of corruption.");
+ return Status::Corruption("SST file is ahead of WALs");
+ }
+ }
+ }
+
+ // True if there's any data in the WALs; if not, we can skip re-processing
+ // them later
+ bool data_seen = false;
+ if (!read_only) {
+ // no need to refcount since client still doesn't have access
+ // to the DB and can not drop column families while we iterate
+ auto max_log_number = log_numbers.back();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > max_log_number) {
+ // Column family cfd has already flushed the data
+ // from all logs. Memtable has to be empty because
+ // we filter the updates based on log_number
+ // (in WriteBatch::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ // If a flush happened in the middle of recovery (e.g. due to the memtable
+ // being full), we flush at the end. Otherwise we would need to record
+ // where we were on the last flush, which makes the logic complicated.
+ if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Recovery failed
+ break;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ versions_->LastSequence());
+ }
+ data_seen = true;
+ }
+
+ // Update the log number info in the version edit corresponding to this
+ // column family. Note that the version edits will be written to MANIFEST
+ // together later.
+ // Writing log_number in the manifest means that any log file
+ // with a number strictly less than (log_number + 1) is already
+ // recovered and should be ignored on the next reincarnation.
+ // Since we already recovered max_log_number, we want all logs
+ // with numbers `<= max_log_number` (including this one) to be ignored
+ if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+ edit->SetLogNumber(max_log_number + 1);
+ }
+ }
+ if (status.ok()) {
+ // we must mark the next log number as used, even though it's
+ // not actually used. that is because VersionSet assumes
+ // VersionSet::next_file_number_ always to be strictly greater than any
+ // log number
+ versions_->MarkFileNumberUsed(max_log_number + 1);
+
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> cf_opts;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ cfds.push_back(cfd);
+ cf_opts.push_back(cfd->GetLatestMutableCFOptions());
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ edit_lists.push_back({&iter->second});
+ }
+ // write MANIFEST with update
+ status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
+ directories_.GetDbDir(),
+ /*new_descriptor_log=*/true);
+ }
+ }
+
+ if (status.ok() && data_seen && !flushed) {
+ status = RestoreAliveLogFiles(log_numbers);
+ }
+
+ event_logger_.Log() << "job" << job_id << "event"
+ << "recovery_finished";
+
+ return status;
+}
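+
+// The replay behaviour above is selected through DBOptions; a minimal
+// configuration sketch (values are illustrative, not recommendations):
+//
+//   rocksdb::Options options;
+//   // Stop replay at the first corrupted record but keep everything before
+//   // it (the mode tracked by stop_replay_for_corruption above).
+//   options.wal_recovery_mode =
+//       rocksdb::WALRecoveryMode::kPointInTimeRecovery;
+//   // Keep recovered memtables in memory instead of flushing them to L0,
+//   // which is the path guarded by avoid_flush_during_recovery above.
+//   options.avoid_flush_during_recovery = true;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);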
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) {
+ if (log_numbers.empty()) {
+ return Status::OK();
+ }
+ Status s;
+ mutex_.AssertHeld();
+ assert(immutable_db_options_.avoid_flush_during_recovery);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ // Mark these as alive so they'll be considered for deletion later by
+ // FindObsoleteFiles()
+ total_log_size_ = 0;
+ log_empty_ = false;
+ for (auto log_number : log_numbers) {
+ LogFileNumberSize log(log_number);
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+ // This gets the apparent size of the logs, not including preallocated space.
+ s = env_->GetFileSize(fname, &log.size);
+ if (!s.ok()) {
+ break;
+ }
+ total_log_size_ += log.size;
+ alive_log_files_.push_back(log);
+ // We preallocate space for logs, but after a crash and restart that
+ // preallocated space is no longer needed. It is likely that only the last
+ // log has such preallocated space, so we only truncate the last log.
+ if (log_number == log_numbers.back()) {
+ std::unique_ptr<FSWritableFile> last_log;
+ Status truncate_status = fs_->ReopenWritableFile(
+ fname,
+ fs_->OptimizeForLogWrite(
+ file_options_,
+ BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ &last_log, nullptr);
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+ }
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Close(IOOptions(), nullptr);
+ }
+ // Not a critical error if fail to truncate.
+ if (!truncate_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to truncate log #%" PRIu64 ": %s", log_number,
+ truncate_status.ToString().c_str());
+ }
+ }
+ }
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ Status s;
+ TableProperties table_properties;
+ {
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": started",
+ cfd->GetName().c_str(), meta.fd.GetNumber());
+
+ // Get the latest mutable cf options while the mutex is still locked
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ bool paranoid_file_checks =
+ cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+ int64_t _current_time = 0;
+ env_->GetCurrentTime(&_current_time); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.oldest_ancester_time = current_time;
+
+ {
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ mutex_.Unlock();
+
+ SequenceNumber earliest_write_conflict_snapshot;
+ std::vector<SequenceNumber> snapshot_seqs =
+ snapshots_.GetAll(&earliest_write_conflict_snapshot);
+ auto snapshot_checker = snapshot_checker_.get();
+ if (use_custom_gc_ && snapshot_checker == nullptr) {
+ snapshot_checker = DisableGCSnapshotChecker::Instance();
+ }
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ s = BuildTable(
+ dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options,
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta, cfd->internal_comparator(),
+ cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ mutable_cf_options.sample_for_compression,
+ cfd->ioptions()->compression_opts, paranoid_file_checks,
+ cfd->internal_stats(), TableFileCreationReason::kRecovery,
+ &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */,
+ -1 /* level */, current_time, write_hint);
+ LogFlush(immutable_db_options_.info_log);
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+ cfd->GetName().c_str(), meta.fd.GetNumber(),
+ meta.fd.GetFileSize(), s.ToString().c_str());
+ mutex_.Lock();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.fd.GetFileSize() > 0) {
+ edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+ meta.fd.GetFileSize(), meta.smallest, meta.largest,
+ meta.fd.smallest_seqno, meta.fd.largest_seqno,
+ meta.marked_for_compaction, meta.oldest_blob_file_number,
+ meta.oldest_ancester_time, meta.file_creation_time,
+ meta.file_checksum, meta.file_checksum_func_name);
+ }
+
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.fd.GetFileSize();
+ stats.num_output_files = 1;
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+ cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+ meta.fd.GetFileSize());
+ RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+ return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ if (db_options.persist_stats_to_disk) {
+ column_families.push_back(
+ ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+ }
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ if (db_options.persist_stats_to_disk) {
+ assert(handles.size() == 2);
+ } else {
+ assert(handles.size() == 1);
+ }
+ // We can delete the handle since DBImpl always holds a reference to
+ // the default column family
+ if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+ delete handles[1];
+ }
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ const bool kSeqPerBatch = true;
+ const bool kBatchPerTxn = true;
+ return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+ !kSeqPerBatch, kBatchPerTxn);
+}
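+
+// A minimal sketch of the column-family overload wrapped here; the names and
+// path below are illustrative only:
+//
+//   std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
+//   cf_descs.emplace_back(rocksdb::kDefaultColumnFamilyName,
+//                         rocksdb::ColumnFamilyOptions());
+//   cf_descs.emplace_back("user_data", rocksdb::ColumnFamilyOptions());
+//   std::vector<rocksdb::ColumnFamilyHandle*> handles;
+//   rocksdb::DBOptions db_options;
+//   db_options.create_if_missing = true;
+//   db_options.create_missing_column_families = true;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(db_options, "/tmp/example_db",
+//                                         cf_descs, &handles, &db);
+//   // Callers own the returned handles and must delete them before the DB.
+//   for (auto* h : handles) { delete h; }
+//   delete db;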
+
+Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log) {
+ Status s;
+ std::unique_ptr<FSWritableFile> lfile;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ FileOptions opt_file_options =
+ fs_->OptimizeForLogWrite(file_options_, db_options);
+ std::string log_fname =
+ LogFileName(immutable_db_options_.wal_dir, log_file_num);
+
+ if (recycle_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "reusing log %" PRIu64 " from recycle list\n",
+ recycle_log_number);
+ std::string old_log_fname =
+ LogFileName(immutable_db_options_.wal_dir, recycle_log_number);
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+ s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+ &lfile, /*dbg=*/nullptr);
+ } else {
+ s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+ }
+
+ if (s.ok()) {
+ lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+ lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+ const auto& listeners = immutable_db_options_.listeners;
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(lfile), log_fname, opt_file_options,
+ env_, nullptr /* stats */, listeners));
+ *new_log = new log::Writer(std::move(file_writer), log_file_num,
+ immutable_db_options_.recycle_log_file_num > 0,
+ immutable_db_options_.manual_wal_flush);
+ }
+ return s;
+}
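+
+// The behaviour of CreateWAL() above is controlled by a handful of DBOptions;
+// a configuration sketch (values are illustrative only):
+//
+//   rocksdb::Options options;
+//   options.wal_dir = "/separate/wal/device";  // where log files are created
+//   options.recycle_log_file_num = 4;  // reuse old WALs via ReuseWritableFile
+//   options.manual_wal_flush = true;   // buffer WAL writes until FlushWAL()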
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn) {
+ Status s = SanitizeOptionsByTable(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = ValidateOptions(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+ handles->clear();
+
+ size_t max_write_buffer_size = 0;
+ for (auto cf : column_families) {
+ max_write_buffer_size =
+ std::max(max_write_buffer_size, cf.options.write_buffer_size);
+ }
+
+ DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+ s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
+ if (s.ok()) {
+ std::vector<std::string> paths;
+ for (auto& db_path : impl->immutable_db_options_.db_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ for (auto& cf : column_families) {
+ for (auto& cf_path : cf.options.cf_paths) {
+ paths.emplace_back(cf_path.path);
+ }
+ }
+ for (auto& path : paths) {
+ s = impl->env_->CreateDirIfMissing(path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ // For recovery from NoSpace() error, we can only handle
+ // the case where the database is stored in a single path
+ if (paths.size() <= 1) {
+ impl->error_handler_.EnableAutoRecovery();
+ }
+ }
+
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ s = impl->CreateArchivalDirectory();
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ // Handles create_if_missing, error_if_exists
+ uint64_t recovered_seq(kMaxSequenceNumber);
+ s = impl->Recover(column_families, false, false, false, &recovered_seq);
+ if (s.ok()) {
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ log::Writer* new_log = nullptr;
+ const size_t preallocate_block_size =
+ impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+ s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+ preallocate_block_size, &new_log);
+ if (s.ok()) {
+ InstrumentedMutexLock wl(&impl->log_write_mutex_);
+ impl->logfile_number_ = new_log_number;
+ assert(new_log != nullptr);
+ impl->logs_.emplace_back(new_log_number, new_log);
+ }
+
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd != nullptr) {
+ handles->push_back(
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ impl->NewThreadStatusCfInfo(cfd);
+ } else {
+ if (db_options.create_missing_column_families) {
+ // missing column family, create it
+ ColumnFamilyHandle* handle;
+ impl->mutex_.Unlock();
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+ impl->mutex_.Lock();
+ if (s.ok()) {
+ handles->push_back(handle);
+ } else {
+ break;
+ }
+ } else {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ impl->InstallSuperVersionAndScheduleWork(
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+ }
+ sv_context.Clean();
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Lock();
+ }
+ impl->alive_log_files_.push_back(
+ DBImpl::LogFileNumberSize(impl->logfile_number_));
+ if (impl->two_write_queues_) {
+ impl->log_write_mutex_.Unlock();
+ }
+
+ impl->DeleteObsoleteFiles();
+ s = impl->directories_.GetDbDir()->Fsync();
+ }
+ if (s.ok()) {
+ // In WritePrepared there could be gap in sequence numbers. This breaks
+ // the trick we use in kPointInTimeRecovery which assumes the first seq in
+ // the log right after the corrupted log is one larger than the last seq
+ // we read from the logs. To let this trick keep working, we add a dummy
+ // entry with the expected sequence to the first log right after recovery.
+ // In non-WritePrepared case also the new log after recovery could be
+ // empty, and thus missing the consecutive seq hint to distinguish
+ // middle-log corruption to corrupted-log-remained-after-recovery. This
+ // case also will be addressed by a dummy write.
+ if (recovered_seq != kMaxSequenceNumber) {
+ WriteBatch empty_batch;
+ WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+ WriteOptions write_options;
+ uint64_t log_used, log_size;
+ log::Writer* log_writer = impl->logs_.back().writer;
+ s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
+ if (s.ok()) {
+ // Need to fsync, otherwise it might get lost after a power reset.
+ s = impl->FlushWAL(false);
+ if (s.ok()) {
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ }
+ }
+ }
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+ // try to read format version but no need to fail Open() even if it fails
+ s = impl->PersistentStatsProcessFormatVersion();
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ auto* vstorage = cfd->current()->storage_info();
+ for (int i = 1; i < vstorage->num_levels(); ++i) {
+ int num_files = vstorage->NumLevelFiles(i);
+ if (num_files > 0) {
+ s = Status::InvalidArgument(
+ "Not all files are at level 0. Cannot "
+ "open with FIFO compaction style.");
+ break;
+ }
+ }
+ }
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ impl->is_snapshot_supported_ = false;
+ }
+ if (cfd->ioptions()->merge_operator != nullptr &&
+ !cfd->mem()->IsMergeOperatorSupported()) {
+ s = Status::InvalidArgument(
+ "The memtable of column family %s does not support merge operator "
+ "its options.merge_operator is non-null",
+ cfd->GetName().c_str());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::Open:Opened");
+ Status persist_options_status;
+ if (s.ok()) {
+ // Persist RocksDB Options before scheduling the compaction.
+ // The WriteOptionsFile() will release and lock the mutex internally.
+ persist_options_status = impl->WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+ *dbptr = impl;
+ impl->opened_successfully_ = true;
+ impl->MaybeScheduleFlushOrCompaction();
+ }
+ impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ impl->immutable_db_options_.sst_file_manager.get());
+ if (s.ok() && sfm) {
+ // Notify SstFileManager about all sst files that already exist in
+ // db_paths[0] and cf_paths[0] when the DB is opened.
+
+ // SstFileManagerImpl needs to know sizes of the files. For files whose size
+ // we already know (sst files that appear in manifest - typically that's the
+ // vast majority of all files), we'll pass the size to SstFileManager.
+ // For all other files SstFileManager will query the size from filesystem.
+
+ std::vector<LiveFileMetaData> metadata;
+
+ impl->mutex_.Lock();
+ impl->versions_->GetLiveFilesMetaData(&metadata);
+ impl->mutex_.Unlock();
+
+ std::unordered_map<std::string, uint64_t> known_file_sizes;
+ for (const auto& md : metadata) {
+ std::string name = md.name;
+ if (!name.empty() && name[0] == '/') {
+ name = name.substr(1);
+ }
+ known_file_sizes[name] = md.size;
+ }
+
+ std::vector<std::string> paths;
+ paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+ for (auto& cf : column_families) {
+ if (!cf.options.cf_paths.empty()) {
+ paths.emplace_back(cf.options.cf_paths[0].path);
+ }
+ }
+ // Remove duplicate paths.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+ for (auto& path : paths) {
+ std::vector<std::string> existing_files;
+ impl->immutable_db_options_.env->GetChildren(path, &existing_files);
+ for (auto& file_name : existing_files) {
+ uint64_t file_number;
+ FileType file_type;
+ std::string file_path = path + "/" + file_name;
+ if (ParseFileName(file_name, &file_number, &file_type) &&
+ file_type == kTableFile) {
+ if (known_file_sizes.count(file_name)) {
+ // We're assuming that each sst file name exists in at most one of
+ // the paths.
+ sfm->OnAddFile(file_path, known_file_sizes.at(file_name),
+ /* compaction */ false);
+ } else {
+ sfm->OnAddFile(file_path);
+ }
+ }
+ }
+ }
+
+ // Reserve some disk buffer space. This is a heuristic - when we run out
+ // of disk space, this ensures that there is at least write_buffer_size
+ // amount of free space before we resume DB writes. In low disk space
+ // conditions, we want to avoid a lot of small L0 files due to frequent
+ // WAL write failures and resultant forced flushes
+ sfm->ReserveDiskBuffer(max_write_buffer_size,
+ impl->immutable_db_options_.db_paths[0].path);
+ }
+#endif // !ROCKSDB_LITE
+
+ if (s.ok()) {
+ ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+ impl);
+ LogFlush(impl->immutable_db_options_.info_log);
+ assert(impl->TEST_WALBufferIsEmpty());
+ // If the assert above fails then we need to FlushWAL before returning
+ // control back to the user.
+ if (!persist_options_status.ok()) {
+ s = Status::IOError(
+ "DB::Open() failed --- Unable to persist Options file",
+ persist_options_status.ToString());
+ }
+ }
+ if (s.ok()) {
+ impl->StartTimedTasks();
+ }
+ if (!s.ok()) {
+ for (auto* h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ *dbptr = nullptr;
+ }
+ return s;
+}
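+
+// The SstFileManager notification block above only runs when one is attached;
+// a minimal wiring sketch, assuming the NewSstFileManager() factory and the
+// rate-limit setter declared in rocksdb/sst_file_manager.h:
+//
+//   #include "rocksdb/sst_file_manager.h"
+//
+//   rocksdb::Options options;
+//   options.sst_file_manager.reset(
+//       rocksdb::NewSstFileManager(rocksdb::Env::Default()));
+//   // Optionally rate-limit file deletions, e.g. to 64 MB/s.
+//   options.sst_file_manager->SetDeleteRateBytesPerSecond(64 << 20);
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
+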
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..a4242bfe1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+#include "db/arena_wrapped_db_iter.h"
+
+#include "db/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "monitoring/perf_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in read only mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ // TODO: is a stopwatch for DB_GET needed? Is a perf timer needed?
+ PERF_TIMER_GUARD(get_snapshot_time);
+ Status s;
+ SequenceNumber snapshot = versions_->LastSequence();
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ SuperVersion* super_version = cfd->GetSuperVersion();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ read_seq,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+
+ for (auto cfh : column_families) {
+ auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ auto* sv = cfd->GetSuperVersion()->Ref();
+ auto* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback);
+ auto* internal_iter =
+ NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), read_seq);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ iterators->push_back(db_iter);
+ }
+
+ return Status::OK();
+}
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+ DB** dbptr, bool /*error_if_log_file_exist*/) {
+ *dbptr = nullptr;
+
+ // Try to first open DB as fully compacted DB
+ Status s;
+ s = CompactedDBImpl::Open(options, dbname, dbptr);
+ if (s.ok()) {
+ return s;
+ }
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ // We can delete the handle since DBImpl always holds a
+ // reference to the default column family
+ delete handles[0];
+ }
+ return s;
+}
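+
+// A minimal usage sketch of read-only open; the path and key below are
+// illustrative only:
+//
+//   rocksdb::Options options;
+//   rocksdb::DB* db = nullptr;
+//   rocksdb::Status s =
+//       rocksdb::DB::OpenForReadOnly(options, "/tmp/example_db", &db);
+//   if (s.ok()) {
+//     std::string value;
+//     s = db->Get(rocksdb::ReadOptions(), "key", &value);
+//     // Any write, e.g. db->Put(...), returns Status::NotSupported() here.
+//   }
+//   delete db;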
+
+Status DB::OpenForReadOnly(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_log_file_exist) {
+ *dbptr = nullptr;
+ handles->clear();
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+ impl->mutex_.Lock();
+ Status s = impl->Recover(column_families, true /* read only */,
+ error_if_log_file_exist);
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto* h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+ const std::string& /*dbname*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+ bool /*error_if_log_file_exist*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..04d06b4a1
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImplReadOnly : public DBImpl {
+ public:
+ DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&) = delete;
+ void operator=(const DBImplReadOnly&) = delete;
+
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ using DBImpl::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ private:
+ friend class DB;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..f0ec27c32
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,671 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in secondary mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool /*readonly*/, bool /*error_if_log_file_exist*/,
+ bool /*error_if_data_exists_in_logs*/, uint64_t*) {
+ mutex_.AssertHeld();
+
+ JobContext job_context(0);
+ Status s;
+ s = static_cast<ReactiveVersionSet*>(versions_.get())
+ ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+ &manifest_reader_status_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ // Initial max_total_in_memory_state_ before recovery logs.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+ if (s.ok()) {
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+ single_column_family_mode_ =
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ // TODO: update options_file_number_ needed?
+
+ job_context.Clean();
+ return s;
+}
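+
+// The secondary recovery path above is reached through DB::OpenAsSecondary();
+// a minimal sketch, assuming the single-column-family overload (paths and
+// option values are illustrative only):
+//
+//   rocksdb::Options options;
+//   options.max_open_files = -1;  // commonly set for secondary instances
+//   rocksdb::DB* secondary = nullptr;
+//   rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
+//       options, "/path/to/primary_db", "/path/to/secondary_scratch",
+//       &secondary);
+//   if (s.ok()) {
+//     // Tail newer MANIFEST/WAL updates produced by the primary.
+//     s = secondary->TryCatchUpWithPrimary();
+//   }
+//   delete secondary;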
+
+// find new WAL and apply them in order to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ Status s;
+ std::vector<uint64_t> logs;
+ s = FindNewLogNumbers(&logs);
+ if (s.ok() && !logs.empty()) {
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+ }
+ return s;
+}
+
+// List wal_dir and find all new WALs, return these log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+ assert(logs != nullptr);
+ std::vector<std::string> filenames;
+ Status s;
+ s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("Failed to open wal_dir",
+ immutable_db_options_.wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ // if log_readers_ is non-empty, it means we have applied all logs with log
+ // numbers smaller than the smallest log in log_readers_, so there is no
+ // need to pass these logs to RecoverLogFiles
+ uint64_t log_number_min = 0;
+ if (!log_readers_.empty()) {
+ log_number_min = log_readers_.begin()->first;
+ }
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kLogFile &&
+ number >= log_number_min) {
+ logs->push_back(number);
+ }
+ }
+ // Recover logs in the order that they were generated
+ if (!logs->empty()) {
+ std::sort(logs->begin(), logs->end());
+ }
+ return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+ uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+ auto iter = log_readers_.find(log_number);
+ // make sure the log file is still present
+ if (iter == log_readers_.end() ||
+ iter->second->reader_->GetLogNumber() != log_number) {
+ // delete the obsolete log reader if log number mismatch
+ if (iter != log_readers_.end()) {
+ log_readers_.erase(iter);
+ }
+ // initialize log reader from log_number
+ // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file
+ std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file,
+ nullptr);
+ if (!status.ok()) {
+ *log_reader = nullptr;
+ return status;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size));
+ }
+
+ // Create the log reader.
+ LogReaderContainer* log_reader_container = new LogReaderContainer(
+ env_, immutable_db_options_.info_log, std::move(fname),
+ std::move(file_reader), log_number);
+ log_readers_.insert(std::make_pair(
+ log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+ }
+ iter = log_readers_.find(log_number);
+ assert(iter != log_readers_.end());
+ *log_reader = iter->second->reader_;
+ return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+ const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ mutex_.AssertHeld();
+ Status status;
+ for (auto log_number : log_numbers) {
+ log::FragmentBufferedReader* reader = nullptr;
+ status = MaybeInitLogReader(log_number, &reader);
+ if (!status.ok()) {
+ return status;
+ }
+ assert(reader != nullptr);
+ }
+ for (auto log_number : log_numbers) {
+ auto it = log_readers_.find(log_number);
+ assert(it != log_readers_.end());
+ log::FragmentBufferedReader* reader = it->second->reader_;
+ // Manually update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+
+    // Determine whether to tolerate incomplete records at the tail end of the
+    // log, then read all the records and add them to a memtable.
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (reader->ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reader->GetReporter()->Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+ std::vector<uint32_t> column_family_ids;
+ status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ const std::vector<FileMetaData*>& l0_files =
+ cfd->current()->storage_info()->LevelFiles(0);
+ SequenceNumber seq =
+ l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is no larger than the
+          // largest sequence number already persisted for this column family,
+          // then its data must reside in an SST file that has already been
+          // added during the prior MANIFEST replay.
+ if (seq_of_batch <= seq) {
+ continue;
+ }
+ auto curr_log_num = port::kMaxUint64;
+ if (cfd_to_current_log_.count(cfd) > 0) {
+ curr_log_num = cfd_to_current_log_[cfd];
+ }
+ // If the active memtable contains records added by replaying an
+ // earlier WAL, then we need to seal the memtable, add it to the
+ // immutable memtable list and create a new active memtable.
+ if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 ||
+ curr_log_num != log_number)) {
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ MemTable* new_mem =
+ cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+ cfd->mem()->SetNextLogNumber(log_number);
+ cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ }
+ }
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(),
+ nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+ true, log_number, this, false /* concurrent_memtable_writes */,
+ next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ }
+      // If a column family was not found, it might mean that the WAL write
+      // batch references a column family that was dropped after the insert.
+      // We don't want to fail the whole write batch in that case -- we just
+      // ignore the update. That's why ignore_missing_column_families is set
+      // to true in the InsertInto call above. Passing a null flush_scheduler
+      // disables memtable flushing, which is what secondary instances need.
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+ cfd_to_current_log_.find(cfd);
+ if (iter == cfd_to_current_log_.end()) {
+ cfd_to_current_log_.insert({cfd, log_number});
+ } else if (log_number > iter->second) {
+ iter->second = log_number;
+ }
+ }
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ } else {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reader->GetReporter()->Corruption(record.size(), status);
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ }
+  // After successfully recovering the WALs, drop all log readers from the
+  // map except the one for the most recent log.
+ if (log_readers_.size() > 1) {
+ auto erase_iter = log_readers_.begin();
+ std::advance(erase_iter, log_readers_.size() - 1);
+ log_readers_.erase(log_readers_.begin(), erase_iter);
+ }
+ return status;
+}
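+
+// Illustrative walk-through of the memtable-sealing logic above (the log
+// numbers are hypothetical): suppose a column family last replayed records
+// from WAL 12, i.e. cfd_to_current_log_[cfd] == 12, and its active memtable
+// is non-empty. When the first batch from WAL 13 that touches this column
+// family arrives, curr_log_num (12) differs from log_number (13), so the
+// active memtable is sealed into the immutable list and a fresh memtable is
+// installed before WriteBatchInternal::InsertInto() applies the batch.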
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return GetImpl(read_options, column_family, key, value);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val) {
+ assert(pinnable_val != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
+ StopWatch sw(env_, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ // Acquire SuperVersion
+ SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+ SequenceNumber snapshot = versions_->LastSequence();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ Status s;
+ LookupKey lkey(key, snapshot);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool done = false;
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ super_version->imm->Get(
+ lkey, pinnable_val->GetSelf(), &s, &merge_context,
+ &max_covering_tombstone_seq, read_options)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ return s;
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->Get(read_options, lkey, pinnable_val, &s,
+ &merge_context, &max_covering_tombstone_seq);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+ Iterator* result = nullptr;
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+ return NewErrorIterator(Status::NotSupported(
+ "tailing iterator not supported in secondary mode"));
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return NewErrorIterator(
+ Status::NotSupported("snapshot not supported in secondary mode"));
+ } else {
+ auto snapshot = versions_->LastSequence();
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SequenceNumber snapshot, ReadCallback* read_callback) {
+ assert(nullptr != cfd);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ snapshot,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter =
+ NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(),
+ db_iter->GetRangeDelAggregator(), snapshot);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+ return Status::NotSupported(
+ "tailing iterator not supported in secondary mode");
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return Status::NotSupported("snapshot not supported in secondary mode");
+ } else {
+ SequenceNumber read_seq = versions_->LastSequence();
+ for (auto cfh : column_families) {
+ ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+ mutex_.AssertHeld();
+ Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency(), which is stricter, succeeds, there is no
+  // need for the more lenient check below.
+ if (s.ok()) {
+ return s;
+ }
+  // It's possible that DBImpl::CheckConsistency() fails because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and return PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ return Status::OK();
+ }
+
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+
+ std::string corruption_messages;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+ s.IsPathNotFound())) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ }
+ }
+ return corruption_messages.empty() ? Status::OK()
+ : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+ assert(versions_.get() != nullptr);
+ assert(manifest_reader_.get() != nullptr);
+ Status s;
+ // read the manifest and apply new changes to the secondary instance
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ JobContext job_context(0, true /*create_superversion*/);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+ ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+ static_cast<uint64_t>(versions_->LastSequence()));
+ for (ColumnFamilyData* cfd : cfds_changed) {
+ if (cfd->IsDropped()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+ cfd->GetName().c_str());
+ continue;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ // list wal_dir to discover new WALs and apply new changes to the secondary
+ // instance
+ if (s.ok()) {
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ if (s.ok()) {
+ for (auto cfd : cfds_changed) {
+ cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+ &job_context.memtables_to_free);
+ auto& sv_context = job_context.superversion_contexts.back();
+ cfd->InstallSuperVersion(&sv_context, &mutex_);
+ sv_context.NewSuperVersion();
+ }
+ }
+ }
+ job_context.Clean();
+
+ // Cleanup unused, obsolete files.
+ JobContext purge_files_job_context(0);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, the secondary instance does not own the database files, so
+    // it is unnecessary for it to force a full scan.
+ FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+ }
+ if (purge_files_job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(purge_files_job_context);
+ }
+ purge_files_job_context.Clean();
+ return s;
+}
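+
+// Illustrative usage sketch (the paths and key below are assumptions): a
+// read-only follower typically polls TryCatchUpWithPrimary() and then serves
+// reads from the refreshed state.
+//
+//   Options options;
+//   options.max_open_files = -1;  // required by OpenAsSecondary
+//   DB* db = nullptr;
+//   Status s = DB::OpenAsSecondary(options, "/path/to/primary_db",
+//                                  "/path/to/secondary_db", &db);
+//   if (s.ok()) {
+//     s = db->TryCatchUpWithPrimary();  // replay new MANIFEST/WAL updates
+//     std::string value;
+//     s = db->Get(ReadOptions(), "some_key", &value);
+//     delete db;
+//   }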
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+ const std::string& secondary_path, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ *dbptr = nullptr;
+ if (db_options.max_open_files != -1) {
+ // TODO (yanqin) maybe support max_open_files != -1 by creating hard links
+ // on SST files so that db secondary can still have access to old SSTs
+ // while primary instance may delete original.
+ return Status::InvalidArgument("require max_open_files to be -1");
+ }
+
+ DBOptions tmp_opts(db_options);
+ Status s;
+ if (nullptr == tmp_opts.info_log) {
+ s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+ if (!s.ok()) {
+ tmp_opts.info_log = nullptr;
+ }
+ }
+
+ handles->clear();
+ DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname);
+ impl->versions_.reset(new ReactiveVersionSet(
+ dbname, &impl->immutable_db_options_, impl->file_options_,
+ impl->table_cache_.get(), impl->write_buffer_manager_,
+ &impl->write_controller_));
+ impl->column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+ impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_);
+
+ impl->mutex_.Lock();
+ s = impl->Recover(column_families, true, false, false);
+ if (s.ok()) {
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (nullptr == cfd) {
+ s = Status::InvalidArgument("Column family not found: ", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ SuperVersionContext sv_context(true /* create_superversion */);
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
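+
+// Illustrative usage sketch for the column-family overload (the names and
+// paths are assumptions; listed column families must already exist in the
+// primary). The caller owns the returned handles and must delete them before
+// deleting the DB.
+//
+//   DBOptions db_opts;
+//   db_opts.max_open_files = -1;  // required by OpenAsSecondary
+//   std::vector<ColumnFamilyDescriptor> cf_descs{
+//       {kDefaultColumnFamilyName, ColumnFamilyOptions()},
+//       {"follower_cf", ColumnFamilyOptions()}};
+//   std::vector<ColumnFamilyHandle*> handles;
+//   DB* db = nullptr;
+//   Status s = DB::OpenAsSecondary(db_opts, "/path/to/primary_db",
+//                                  "/path/to/secondary_db", cf_descs,
+//                                  &handles, &db);
+//   if (s.ok()) {
+//     for (auto* h : handles) {
+//       delete h;
+//     }
+//     delete db;
+//   }
+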
+#else // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+ const std::string& /*name*/,
+ const std::string& /*secondary_path*/,
+ DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::string& /*secondary_path*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..24f2e7767
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,333 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold log reader, log reporter, log status.
+class LogReaderContainer {
+ public:
+ LogReaderContainer()
+ : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+ LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+ std::string fname,
+ std::unique_ptr<SequentialFileReader>&& file_reader,
+ uint64_t log_number) {
+ LogReporter* reporter = new LogReporter();
+ status_ = new Status();
+ reporter->env = env;
+ reporter->info_log = info_log.get();
+ reporter->fname = std::move(fname);
+ reporter->status = status_;
+ reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+ reporter, true /*checksum*/,
+ log_number);
+ }
+ log::FragmentBufferedReader* reader_;
+ log::Reader::Reporter* reporter_;
+ Status* status_;
+ ~LogReaderContainer() {
+ delete reader_;
+ delete reporter_;
+ delete status_;
+ }
+ private:
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ std::string fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname.c_str(), static_cast<int>(bytes),
+ s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
+// effort attempts to catch up with the primary.
+class DBImplSecondary : public DBImpl {
+ public:
+ DBImplSecondary(const DBOptions& options, const std::string& dbname);
+ ~DBImplSecondary() override;
+
+ // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+ // and log_readers_ to facilitate future operations.
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, bool error_if_log_file_exist,
+ bool error_if_data_exists_in_logs,
+ uint64_t* = nullptr) override;
+
+ // Implementations of the DB interface
+ using DB::Get;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value);
+
+ using DBImpl::NewIterator;
+ Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback);
+
+ Status NewIterators(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Merge;
+ Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Delete;
+ Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SingleDelete;
+ Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* /*manifest_file_size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Flush;
+ Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetDBOptions;
+ Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+ /*options_map*/) override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* /*cfd*/,
+ const std::unordered_map<std::string, std::string>& /*options_map*/)
+ override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction and/or write to MANIFEST.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+ Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it does not exist.
+ Status MaybeInitLogReader(uint64_t log_number,
+ log::FragmentBufferedReader** log_reader);
+
+  // Check that all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+ Status CheckConsistency() override;
+
+ protected:
+ // ColumnFamilyCollector is a write batch handler which does nothing
+ // except recording unique column family IDs
+ class ColumnFamilyCollector : public WriteBatch::Handler {
+ std::unordered_set<uint32_t> column_family_ids_;
+
+ Status AddColumnFamilyId(uint32_t column_family_id) {
+ if (column_family_ids_.find(column_family_id) ==
+ column_family_ids_.end()) {
+ column_family_ids_.insert(column_family_id);
+ }
+ return Status::OK();
+ }
+
+ public:
+ explicit ColumnFamilyCollector() {}
+
+ ~ColumnFamilyCollector() override {}
+
+ Status PutCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ const std::unordered_set<uint32_t>& column_families() const {
+ return column_family_ids_;
+ }
+ };
+
+ Status CollectColumnFamilyIdsFromWriteBatch(
+ const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+ assert(column_family_ids != nullptr);
+ column_family_ids->clear();
+ ColumnFamilyCollector handler;
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ for (const auto& cf : handler.column_families()) {
+ column_family_ids->push_back(cf);
+ }
+ }
+ return s;
+ }
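+
+  // Illustrative example (the handle and keys are hypothetical): for a batch
+  // built as
+  //   WriteBatch batch;
+  //   batch.Put(cf_handle, "k1", "v1");  // cf_handle refers to CF with ID 2
+  //   batch.Delete("k2");                // default column family, ID 0
+  // CollectColumnFamilyIdsFromWriteBatch() fills *column_family_ids with
+  // {2, 0} (order unspecified, since the collector uses an unordered_set).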
+
+ bool OwnTablesAndLogs() const override {
+ // Currently, the secondary instance does not own the database files. It
+ // simply opens the files of the primary instance and tracks their file
+ // descriptors until they become obsolete. In the future, the secondary may
+ // create links to database files. OwnTablesAndLogs will return true then.
+ return false;
+ }
+
+ private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplSecondary(const DBImplSecondary&);
+ void operator=(const DBImplSecondary&);
+
+ using DBImpl::Recover;
+
+ Status FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+ Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+ // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+ // REQUIRES: log_numbers are sorted in ascending order
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+ std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache a log reader for each log number, used to continue WAL replay
+  // after recovery
+ std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+ // Current WAL number replayed for each column family.
+ std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..8f6f685e4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,1839 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <cinttypes>
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+ if (!cfh->cfd()->ioptions()->merge_operator) {
+ return Status::NotSupported("Provide a merge_operator when opening DB");
+ } else {
+ return DB::Merge(o, column_family, key, val);
+ }
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ return DB::SingleDelete(write_options, column_family, key);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+ PreReleaseCallback* callback) {
+ recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+ return WriteImpl(write_options, my_batch, nullptr, nullptr);
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback) {
+ return WriteImpl(write_options, my_batch, callback, nullptr);
+}
+#endif // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used,
+ size_t batch_cnt,
+ PreReleaseCallback* pre_release_callback) {
+ assert(!seq_per_batch_ || batch_cnt != 0);
+ if (my_batch == nullptr) {
+ return Status::Corruption("Batch is nullptr!");
+ }
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Write(my_batch);
+ }
+ }
+ if (write_options.sync && write_options.disableWAL) {
+ return Status::InvalidArgument("Sync writes has to enable WAL.");
+ }
+ if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with concurrent prepares");
+ }
+ if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+ // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with seq_per_batch");
+ }
+ if (immutable_db_options_.unordered_write &&
+ immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with unordered_write");
+ }
+ // Otherwise IsLatestPersistentState optimization does not make sense
+ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+ disable_memtable);
+
+ Status status;
+ if (write_options.low_pri) {
+ status = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ if (two_write_queues_ && disable_memtable) {
+ AssignOrder assign_order =
+ seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise these are WAL-only Prepare batches under the WriteCommitted
+    // policy and they don't consume sequence numbers.
+ return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+ callback, log_used, log_ref, seq_used, batch_cnt,
+ pre_release_callback, assign_order,
+ kDontPublishLastSeq, disable_memtable);
+ }
+
+ if (immutable_db_options_.unordered_write) {
+ const size_t sub_batch_cnt = batch_cnt != 0
+ ? batch_cnt
+ // every key is a sub-batch consuming a seq
+ : WriteBatchInternal::Count(my_batch);
+ uint64_t seq;
+ // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+ status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback,
+ log_used, log_ref, &seq, sub_batch_cnt,
+ pre_release_callback, kDoAssignOrder,
+ kDoPublishLastSeq, disable_memtable);
+ TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+ if (!status.ok()) {
+ return status;
+ }
+ if (seq_used) {
+ *seq_used = seq;
+ }
+ if (!disable_memtable) {
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+ status = UnorderedWriteMemtable(write_options, my_batch, callback,
+ log_ref, seq, sub_batch_cnt);
+ }
+ return status;
+ }
+
+ if (immutable_db_options_.enable_pipelined_write) {
+ return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+ log_ref, disable_memtable, seq_used);
+ }
+
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, batch_cnt, pre_release_callback);
+
+ if (!write_options.disableWAL) {
+ RecordTick(stats_, WRITE_WITH_WAL);
+ }
+
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ // we are a non-leader in a parallel group
+
+ if (w.ShouldWriteToMemtable()) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+ batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ // we're responsible for exit batch group
+ // TODO(myabandeh): propagate status to write_group
+ auto last_sequence = w.write_group->last_sequence;
+ versions_->SetLastSequence(last_sequence);
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupFollower(&w);
+ }
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ // STATE_COMPLETED conditional below handles exit
+
+ status = w.FinalStatus();
+ }
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ // write is complete and leader has updated sequence
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in
+  // "writers_" when it finds them suitable, and finish them in the same write
+  // batch. This is how a write job could be done by another writer.
+ WriteContext write_context;
+ WriteThread::WriteGroup write_group;
+ bool in_parallel_group = false;
+ uint64_t last_sequence = kMaxSequenceNumber;
+
+ mutex_.Lock();
+
+ bool need_log_sync = write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ if (!two_write_queues_ || !disable_memtable) {
+ // With concurrent writes we do preprocess only in the write thread that
+ // also does write to memtable to avoid sync issue on shared data structure
+ // with the other thread
+
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+ log::Writer* log_writer = logs_.back().writer;
+
+ mutex_.Unlock();
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into memtables
+
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+ if (status.ok()) {
+ // Rules for when we can update the memtable concurrently
+ // 1. supported by memtable
+ // 2. Puts are not okay if inplace_update_support
+ // 3. Merges are not okay
+ //
+ // Rules 1..2 are enforced by checking the options
+ // during startup (CheckConcurrentWritesSupported), so if
+ // options.allow_concurrent_memtable_write is true then they can be
+ // assumed to be true. Rule 3 is checked for each batch. We could
+    // relax rule 2 if we could prevent write batches from referring
+ // more than once to a particular key.
+ bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+ write_group.size > 1;
+ size_t total_count = 0;
+ size_t valid_batches = 0;
+ size_t total_byte_size = 0;
+ size_t pre_release_callback_cnt = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ valid_batches += writer->batch_cnt;
+ if (writer->ShouldWriteToMemtable()) {
+ total_count += WriteBatchInternal::Count(writer->batch);
+ parallel = parallel && !writer->batch->HasMerge();
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+ // Note about seq_per_batch_: either disableWAL is set for the entire write
+ // group or not. In either case we inc seq for each write batch with no
+ // failed callback. This means that there could be a batch with
+    // disable_memtable in between; although we do not write this batch to
+ // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+ // the seq per valid written key to mem.
+ size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+ const bool concurrent_update = two_write_queues_;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+ concurrent_update);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (!two_write_queues_) {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
+ need_log_dir_sync, last_sequence + 1);
+ }
+ } else {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+ seq_inc);
+ } else {
+ // Otherwise we inc seq number for memtable writes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+ }
+ assert(last_sequence != kMaxSequenceNumber);
+ const SequenceNumber current_sequence = last_sequence + 1;
+ last_sequence += seq_inc;
+
+ // PreReleaseCallback is called after WAL write and before memtable write
+ if (status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ size_t index = 0;
+ // Note: the logic for advancing seq here must be consistent with the
+ // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+ // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+ // the merged batch during recovery from the WAL.
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = next_sequence;
+ if (writer->pre_release_callback) {
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ if (seq_per_batch_) {
+ assert(writer->batch_cnt);
+ next_sequence += writer->batch_cnt;
+ } else if (writer->ShouldWriteToMemtable()) {
+ next_sequence += WriteBatchInternal::Count(writer->batch);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ if (!parallel) {
+ // w.sequence will be set inside InsertInto
+ w.status = WriteBatchInternal::InsertInto(
+ write_group, current_sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families,
+ 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+ batch_per_txn_);
+ } else {
+ write_group.last_sequence = last_sequence;
+ write_thread_.LaunchParallelMemTableWriters(&write_group);
+ in_parallel_group = true;
+
+      // Each parallel follower is doing its own writes. The leader should
+ // also do its own.
+ if (w.ShouldWriteToMemtable()) {
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ assert(w.sequence == current_sequence);
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/,
+ this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+ w.batch_cnt, batch_per_txn_,
+ write_options.memtable_insert_hint_per_batch);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
+ mutex_.Unlock();
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+ // hence provide a simple implementation that is not necessarily efficient.
+ if (two_write_queues_) {
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ }
+
+ bool should_exit_batch_group = true;
+ if (in_parallel_group) {
+ // CompleteParallelWorker returns true if this thread should
+ // handle exit, false means somebody else did
+ should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+ }
+ if (should_exit_batch_group) {
+ if (status.ok()) {
+ // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+ versions_->SetLastSequence(last_sequence);
+ }
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupLeader(write_group, status);
+ }
+
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ return status;
+}
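+
+// Illustrative usage sketch (db and the keys are assumptions) showing the
+// public API that funnels into WriteImpl, and the option combination that is
+// rejected up front (sync together with disableWAL):
+//
+//   WriteBatch batch;
+//   batch.Put("k1", "v1");
+//   batch.Delete("k2");
+//   WriteOptions wo;
+//   wo.sync = true;         // fsync the WAL before returning
+//   wo.disableWAL = false;  // sync==true with disableWAL==true would be
+//                           // rejected as InvalidArgument
+//   Status s = db->Write(wo, &batch);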
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteContext write_context;
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable);
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ WriteThread::WriteGroup wal_write_group;
+ if (w.callback && !w.callback->AllowWriteBatching()) {
+ write_thread_.WaitForMemTableWriters();
+ }
+ mutex_.Lock();
+ bool need_log_sync = !write_options.disableWAL && write_options.sync;
+ bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ log::Writer* log_writer = logs_.back().writer;
+ mutex_.Unlock();
+
+    // This can set a non-OK status if the callback fails.
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+ const SequenceNumber current_sequence =
+ write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+ size_t total_count = 0;
+ size_t total_byte_size = 0;
+
+ if (w.status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ for (auto writer : wal_write_group) {
+ if (writer->CheckCallback(this)) {
+ if (writer->ShouldWriteToMemtable()) {
+ writer->sequence = next_sequence;
+ size_t count = WriteBatchInternal::Count(writer->batch);
+ next_sequence += count;
+ total_count += count;
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ }
+ }
+ if (w.disable_wal) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+ }
+
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (w.status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+ RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+ if (wal_write_group.size > 1) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ wal_write_group.size - 1);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+ }
+ w.status = WriteToWAL(wal_write_group, log_writer, log_used,
+ need_log_sync, need_log_dir_sync, current_sequence);
+ }
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(w.status);
+ }
+
+ if (need_log_sync) {
+ mutex_.Lock();
+ MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status);
+ mutex_.Unlock();
+ }
+
+ write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+ }
+
+ WriteThread::WriteGroup memtable_write_group;
+ if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+ PERF_TIMER_GUARD(write_memtable_time);
+ assert(w.ShouldWriteToMemtable());
+ write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+ if (memtable_write_group.size > 1 &&
+ immutable_db_options_.allow_concurrent_memtable_write) {
+ write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+ } else {
+ memtable_write_group.status = WriteBatchInternal::InsertInto(
+ memtable_write_group, w.sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+ }
+ }
+
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ assert(w.ShouldWriteToMemtable());
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ MemTableInsertStatusCheck(w.status);
+ versions_->SetLastSequence(w.write_group->last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback, uint64_t log_ref,
+ SequenceNumber seq,
+ const size_t sub_batch_cnt) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ false /*disable_memtable*/);
+
+ if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+ w.sequence = seq;
+ size_t total_count = WriteBatchInternal::Count(my_batch);
+ InternalStats* stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+
+ WriteStatusCheck(w.status);
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+ if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ == 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, so it does not miss the update to pending_memtable_writes_
+    // even though that counter is not modified under the mutex.
+ std::lock_guard<std::mutex> lck(switch_mutex_);
+ switch_cv_.notify_all();
+ }
+
+ if (!w.FinalStatus().ok()) {
+ return w.FinalStatus();
+ }
+ return Status::OK();
+}
+
+// The 2nd write queue. If enabled it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence which is only
+// applicable in a two-queue setting.
+Status DBImpl::WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+ Status status;
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, sub_batch_cnt, pre_release_callback);
+ RecordTick(stats_, WRITE_WITH_WAL);
+ StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE);
+
+ write_thread->JoinBatchGroup(&w);
+ assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+ if (publish_last_seq == kDoPublishLastSeq) {
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ WriteContext write_context;
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+ // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+ // without paying the cost of obtaining the mutex.
+ if (status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ bool need_log_sync = false;
+ status = PreprocessWrite(write_options, &need_log_sync, &write_context);
+ WriteStatusCheck(status);
+ }
+ if (!status.ok()) {
+ WriteThread::WriteGroup write_group;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ }
+
+ WriteThread::WriteGroup write_group;
+ uint64_t last_sequence;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ // Note: no need to update last_batch_group_size_ here since the batch writes
+ // to WAL only
+
+ size_t pre_release_callback_cnt = 0;
+ size_t total_byte_size = 0;
+ for (auto* writer : write_group) {
+ if (writer->CheckCallback(this)) {
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+
+ const bool concurrent_update = true;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ PERF_TIMER_GUARD(write_wal_time);
+  // LastAllocatedSequence is increased inside WriteToWAL under
+  // log_write_mutex_ to ensure ordered events in the WAL
+ size_t seq_inc = 0 /* total_count */;
+ if (assign_order == kDoAssignOrder) {
+ size_t total_batch_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ if (!writer->CallbackFailed()) {
+ total_batch_cnt += writer->batch_cnt;
+ }
+ }
+ seq_inc = total_batch_cnt;
+ }
+ if (!write_options.disableWAL) {
+ status =
+ ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+ } else {
+    // Otherwise we just increment the sequence number for allocation purposes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+
+ size_t memtable_write_cnt = 0;
+ auto curr_seq = last_sequence + 1;
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = curr_seq;
+ if (assign_order == kDoAssignOrder) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ curr_seq += writer->batch_cnt;
+ }
+ if (!writer->disable_memtable) {
+ memtable_write_cnt++;
+ }
+ // else seq advances only by memtable writes
+ }
+ if (status.ok() && write_options.sync) {
+ assert(!write_options.disableWAL);
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+ if (status.ok()) {
+ size_t index = 0;
+ for (auto* writer : write_group) {
+ if (!writer->CallbackFailed() && writer->pre_release_callback) {
+ assert(writer->sequence != kMaxSequenceNumber);
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ }
+ }
+ if (publish_last_seq == kDoPublishLastSeq) {
+ versions_->SetLastSequence(last_sequence + seq_inc);
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ }
+ if (immutable_db_options_.unordered_write && status.ok()) {
+ pending_memtable_writes_ += memtable_write_cnt;
+ }
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return status;
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ mutex_.Lock();
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+ // A non-OK status here indicates that the state implied by the
+ // WAL has diverged from the in-memory state. This could be
+ // because of a corrupt write_batch (very bad), or because the
+ // client specified an invalid column family and didn't specify
+ // ignore_missing_column_families.
+ if (!status.ok()) {
+ mutex_.Lock();
+ assert(!error_handler_.IsBGWorkStopped());
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
+ mutex_.Unlock();
+ }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+ bool* need_log_sync,
+ WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr && need_log_sync != nullptr);
+ Status status;
+
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+
+ PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+ assert(!single_column_family_mode_ ||
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
+ if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
+ total_log_size_ > GetMaxTotalWalSize())) {
+ WaitForPendingWrites();
+ status = SwitchWAL(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+ // Before a new memtable is added in SwitchMemtable(),
+    // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+    // thread is writing to another DB that shares the same write buffer, that
+    // DB may get flushed as well. We may end up flushing many more DBs than
+    // needed. It's suboptimal but still correct.
+ WaitForPendingWrites();
+ status = HandleWriteBufferFull(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+ status = TrimMemtableHistory(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+ WaitForPendingWrites();
+ status = ScheduleFlushes(write_context);
+ }
+
+ PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+ if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+ write_controller_.NeedsDelay()))) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This can create a fairness issue where smaller
+    // writes expire while larger writes go through. Can optimize it if it
+    // becomes an issue.
+ status = DelayWrite(last_batch_group_size_, write_options);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (status.ok() && *need_log_sync) {
+    // Wait until the parallel syncs are finished. Any sync process has to sync
+    // the front log too, so it is enough to check the status of front().
+    // We use a while loop since log_sync_cv_ is signalled whenever any sync
+    // finishes.
+ // Note: there does not seem to be a reason to wait for parallel sync at
+ // this early step but it is not important since parallel sync (SyncWAL) and
+ // need_log_sync are usually not used together.
+ while (logs_.front().getting_synced) {
+ log_sync_cv_.Wait();
+ }
+ for (auto& log : logs_) {
+ assert(!log.getting_synced);
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+      // Note: there does not seem to be a reason to set this early before we
+      // actually write to the WAL.
+ log.getting_synced = true;
+ }
+ } else {
+ *need_log_sync = false;
+ }
+
+ return status;
+}
+
+WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, size_t* write_with_wal,
+ WriteBatch** to_be_cached_state) {
+ assert(write_with_wal != nullptr);
+ assert(tmp_batch != nullptr);
+ assert(*to_be_cached_state == nullptr);
+ WriteBatch* merged_batch = nullptr;
+ *write_with_wal = 0;
+ auto* leader = write_group.leader;
+ assert(!leader->disable_wal); // Same holds for all in the batch group
+ if (write_group.size == 1 && !leader->CallbackFailed() &&
+ leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the leader's WriteBatch to the WAL if the group
+    // contains only one batch, that batch should be written to the WAL,
+    // and the batch does not ask to be truncated (no WAL termination point).
+ merged_batch = leader->batch;
+ if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) {
+ *to_be_cached_state = merged_batch;
+ }
+ *write_with_wal = 1;
+ } else {
+ // WAL needs all of the batches flattened into a single batch.
+ // We could avoid copying here with an iov-like AddRecord
+ // interface
+ merged_batch = tmp_batch;
+ for (auto writer : write_group) {
+ if (!writer->CallbackFailed()) {
+ WriteBatchInternal::Append(merged_batch, writer->batch,
+ /*WAL_only*/ true);
+ if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+ // We only need to cache the last of such write batch
+ *to_be_cached_state = writer->batch;
+ }
+ (*write_with_wal)++;
+ }
+ }
+ }
+ return merged_batch;
+}
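+
+// A hedged sketch of how the WAL writers below consume MergeBatch(); the
+// names are taken from the surrounding code, but this is an illustration
+// rather than a copy of either call site:
+//
+//   WriteBatch tmp;
+//   size_t write_with_wal = 0;
+//   WriteBatch* to_be_cached_state = nullptr;
+//   WriteBatch* merged =
+//       MergeBatch(write_group, &tmp, &write_with_wal, &to_be_cached_state);
+//   // `merged` is either the leader's own batch (single-writer fast path,
+//   // write_with_wal == 1) or `tmp` holding every non-failed writer's batch
+//   // appended back to back; it is then written as a single WAL record.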
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+ log::Writer* log_writer, uint64_t* log_used,
+ uint64_t* log_size) {
+ assert(log_size != nullptr);
+ Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+ *log_size = log_entry.size();
+  // When two_write_queues_ is set, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns around missed branch prediction, penalize the
+  // new manual_wal_flush_ feature (by UNLIKELY) instead of the more common
+  // case when we do not need any locking.
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+ Status status = log_writer->AddRecord(log_entry);
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+ if (log_used != nullptr) {
+ *log_used = logfile_number_;
+ }
+ total_log_size_ += log_entry.size();
+ // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
+ // since alive_log_files_ might be modified concurrently
+ alive_log_files_.back().AddSize(log_entry.size());
+ log_empty_ = false;
+ return status;
+}
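+
+// A generic C++ sketch (not RocksDB code; `needs_locking`, `mu` and
+// `AppendRecord` are illustrative names) of the conditional-locking idea
+// above, expressed with a deferred std::unique_lock instead of explicit
+// Lock()/Unlock() calls:
+//
+//   #include <mutex>
+//   #include <string>
+//
+//   std::mutex mu;
+//
+//   void Append(bool needs_locking, const std::string& entry) {
+//     std::unique_lock<std::mutex> lck(mu, std::defer_lock);
+//     if (needs_locking) {
+//       lck.lock();  // only pay for the lock when a concurrent caller exists
+//     }
+//     AppendRecord(entry);  // protected iff needs_locking was true
+//   }  // unique_lock releases mu automatically if it was taken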
+
+Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_,
+ &write_with_wal, &to_be_cached_state);
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+
+ if (status.ok() && need_log_sync) {
+ StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
+ // It's safe to access logs_ with unlocked mutex_ here because:
+ // - we've set getting_synced=true for all logs,
+ // so other threads won't pop from logs_ while we're here,
+ // - only writer thread can push to logs_, and we're in
+ // writer thread, so no one will push to logs_,
+ // - as long as other threads don't modify it, it's safe to read
+ // from std::deque from multiple threads concurrently.
+ for (auto& log : logs_) {
+ status = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && need_log_dir_sync) {
+ // We only sync WAL directory the first time WAL syncing is
+ // requested, so that in case users never turn on WAL sync,
+ // we can avoid the disk I/O in the write code path.
+ status = directories_.GetWalDir()->Fsync();
+ }
+ }
+
+ if (merged_batch == &tmp_batch_) {
+ tmp_batch_.Clear();
+ }
+ if (status.ok()) {
+ auto stats = default_cf_internal_stats_;
+ if (need_log_sync) {
+ stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ }
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence,
+ size_t seq_inc) {
+ Status status;
+
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ WriteBatch tmp_batch;
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch =
+ MergeBatch(write_group, &tmp_batch, &write_with_wal, &to_be_cached_state);
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might be
+  // pushed back concurrently
+ log_write_mutex_.Lock();
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+ *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ auto sequence = *last_sequence + 1;
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ log::Writer* log_writer = logs_.back().writer;
+ uint64_t log_size;
+ status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+ log_write_mutex_.Unlock();
+
+ if (status.ok()) {
+ const bool concurrent = true;
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+ concurrent);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+ concurrent);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return status;
+}
+
+Status DBImpl::WriteRecoverableState() {
+ mutex_.AssertHeld();
+ if (!cached_recoverable_state_empty_) {
+ bool dont_care_bool;
+ SequenceNumber next_seq;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ SequenceNumber seq;
+ if (two_write_queues_) {
+ seq = versions_->FetchAddLastAllocatedSequence(0);
+ } else {
+ seq = versions_->LastSequence();
+ }
+ WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+ auto status = WriteBatchInternal::InsertInto(
+ &cached_recoverable_state_, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_, true,
+ 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+ &next_seq, &dont_care_bool, seq_per_batch_);
+ auto last_seq = next_seq - 1;
+ if (two_write_queues_) {
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+ versions_->SetLastPublishedSequence(last_seq);
+ }
+ versions_->SetLastSequence(last_seq);
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ if (status.ok() && recoverable_state_pre_release_callback_) {
+ const bool DISABLE_MEMTABLE = true;
+ for (uint64_t sub_batch_seq = seq + 1;
+ sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+ uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
+ status = recoverable_state_pre_release_callback_->Callback(
+ sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+ mutex_.Lock();
+ }
+ }
+ if (status.ok()) {
+ cached_recoverable_state_.Clear();
+ cached_recoverable_state_empty_ = true;
+ }
+ return status;
+ }
+ return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* cfds) {
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds->push_back(cfd);
+ }
+ }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+ assert(immutable_db_options_.atomic_flush);
+ auto seq = versions_->LastSequence();
+ for (auto cfd : cfds) {
+ cfd->imm()->AssignAtomicFlushSeq(seq);
+ }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ if (alive_log_files_.begin()->getting_flushed) {
+ return status;
+ }
+
+ auto oldest_alive_log = alive_log_files_.begin()->number;
+ bool flush_wont_release_oldest_log = false;
+ if (allow_2pc()) {
+ auto oldest_log_with_uncommitted_prep =
+ logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+ assert(oldest_log_with_uncommitted_prep == 0 ||
+ oldest_log_with_uncommitted_prep >= oldest_alive_log);
+ if (oldest_log_with_uncommitted_prep > 0 &&
+ oldest_log_with_uncommitted_prep == oldest_alive_log) {
+ if (unable_to_release_oldest_log_) {
+ // we already attempted to flush all column families dependent on
+ // the oldest alive log but the log still contained uncommitted
+ // transactions so there is still nothing that we can do.
+ return status;
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to release oldest log due to uncommitted transaction");
+ unable_to_release_oldest_log_ = true;
+ flush_wont_release_oldest_log = true;
+ }
+ }
+ }
+ if (!flush_wont_release_oldest_log) {
+    // we only mark this log as getting flushed if we have successfully
+    // flushed all data in this log. If this log contains outstanding prepared
+    // transactions then we cannot flush this log until those transactions are
+    // committed.
+ unable_to_release_oldest_log_ = false;
+ alive_log_files_.begin()->getting_flushed = true;
+ }
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing all column families with data in WAL number %" PRIu64
+ ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+ // no need to refcount because drop is happening in write thread, so can't
+ // happen while we're in the write thread
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+ cfds.push_back(cfd);
+ }
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (const auto cfd : cfds) {
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB that shares the same write buffer, that
+  // DB may get flushed as well. We may end up flushing many more DBs than
+  // needed. It's suboptimal but still correct.
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing column family with oldest memtable entry. Write buffer is "
+ "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".",
+ write_buffer_manager_->memory_usage(),
+ write_buffer_manager_->buffer_size());
+ // no need to refcount because drop is happening in write thread, so can't
+ // happen while we're in the write thread
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ ColumnFamilyData* cfd_picked = nullptr;
+ SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (!cfd->mem()->IsEmpty()) {
+        // We only consider the active memtable, assuming the immutable
+        // memtables are already in the process of being flushed.
+ uint64_t seq = cfd->mem()->GetCreationSeq();
+ if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+ cfd_picked = cfd;
+ seq_num_for_cf_picked = seq;
+ }
+ }
+ }
+ if (cfd_picked != nullptr) {
+ cfds.push_back(cfd_picked);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ for (const auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty()) {
+ continue;
+ }
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (const auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+ mutex_.AssertHeld();
+ return mutable_db_options_.max_total_wal_size == 0
+ ? 4 * max_total_in_memory_state_
+ : mutable_db_options_.max_total_wal_size;
+}
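+
+// A worked example of the default above (numbers are assumptions for
+// illustration, not recommendations): with max_total_wal_size == 0 and
+// max_total_in_memory_state_ of, say, 256 MB, the effective WAL cap becomes
+// 4 * 256 MB = 1 GB, so PreprocessWrite() triggers SwitchWAL() and flushes
+// the column families bound to the oldest live WAL once total_log_size_
+// exceeds 1 GB.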
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+ const WriteOptions& write_options) {
+ uint64_t time_delayed = 0;
+ bool delayed = false;
+ {
+ StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
+ uint64_t delay = write_controller_.GetDelay(env_, num_bytes);
+ if (delay > 0) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+ mutex_.Unlock();
+      // We will delay the write until we have slept for `delay` microseconds
+      // or we don't need a delay anymore
+ const uint64_t kDelayInterval = 1000;
+ uint64_t stall_end = sw.start_time() + delay;
+ while (write_controller_.NeedsDelay()) {
+ if (env_->NowMicros() >= stall_end) {
+ // We already delayed this write `delay` microseconds
+ break;
+ }
+
+ delayed = true;
+ // Sleep for 0.001 seconds
+ env_->SleepForMicroseconds(kDelayInterval);
+ }
+ mutex_.Lock();
+ write_thread_.EndWriteStall();
+ }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely.
+ while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ delayed = true;
+
+      // Notify write_thread_ about the stall so it can set up a barrier and
+      // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+ bg_cv_.Wait();
+ write_thread_.EndWriteStall();
+ }
+ }
+ assert(!delayed || !write_options.no_slowdown);
+ if (delayed) {
+ default_cf_internal_stats_->AddDBStats(
+ InternalStats::kIntStatsWriteStallMicros, time_delayed);
+ RecordTick(stats_, STALL_MICROS, time_delayed);
+ }
+
+ // If DB is not in read-only mode and write_controller is not stopping
+ // writes, we can ignore any background errors and allow the write to
+ // proceed
+ Status s;
+ if (write_controller_.IsStopped()) {
+ // If writes are still stopped, it means we bailed due to a background
+ // error
+ s = Status::Incomplete(error_handler_.GetBGError().ToString());
+ }
+ if (error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
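+
+// A worked example of the delay loop above (illustrative numbers only): if
+// write_controller_.GetDelay() returns 3000 microseconds and kDelayInterval
+// is 1000, the writer sleeps in up to three 1 ms slices, re-checking
+// NeedsDelay() between slices so the stall can end early; the DB mutex is
+// released for the entire sleep and re-acquired afterwards.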
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch) {
+ assert(write_options.low_pri);
+ // This is called outside the DB mutex. Although it is safe to make the call,
+ // the consistency condition is not guaranteed to hold. It's OK to live with
+ // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind,
+  // so we start rate-limiting low-priority writes.
+ if (write_controller_.NeedSpeedupCompaction()) {
+ if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+ // For 2PC, we only rate limit prepare, not commit.
+ return Status::OK();
+ }
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Low priority write stall");
+ } else {
+ assert(my_batch != nullptr);
+      // Rate limit those writes. The reason we don't wait completely is that
+      // if the write load is heavy, low-pri writes might never get a chance
+      // to run. This way we guarantee they still make slow progress.
+ PERF_TIMER_GUARD(write_delay_time);
+ write_controller_.low_pri_rate_limiter()->Request(
+ my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ return Status::OK();
+}
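+
+// A hedged usage sketch of the low-priority write path above (a minimal
+// illustration, not authoritative documentation; `db` is an already-opened
+// DB* and error handling is omitted):
+//
+//   WriteOptions wo;
+//   wo.low_pri = true;        // opt into rate limiting when compaction lags
+//   // wo.no_slowdown = true; // alternatively, fail fast with Incomplete()
+//   Status s = db->Put(wo, "key", "value");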
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+ assert(cfds != nullptr);
+ if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+ for (ColumnFamilyData* cfd : *cfds) {
+ if (cfd == cfd_stats) {
+ // stats CF already included in cfds
+ return;
+ }
+ }
+ // force flush stats CF when its log number is less than all other CF's
+ // log numbers
+ bool force_flush_stats_cf = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ force_flush_stats_cf = false;
+ }
+ }
+ if (force_flush_stats_cf) {
+ cfds->push_back(cfd_stats);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with automated flush "
+ "to avoid holding old logs");
+ }
+ }
+ }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+ nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ for (auto& cfd : cfds) {
+ autovector<MemTable*> to_delete;
+ cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage());
+ if (!to_delete.empty()) {
+ for (auto m : to_delete) {
+ delete m;
+ }
+ context->superversion_context.NewSuperVersion();
+ assert(context->superversion_context.new_superversion.get() != nullptr);
+ cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+ }
+
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ flush_scheduler_.Clear();
+ } else {
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ Status status;
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (auto& cfd : cfds) {
+ if (!cfd->mem()->IsEmpty()) {
+ status = SwitchMemtable(cfd, context);
+ }
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+ const MemTableInfo& mem_table_info) {
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnMemTableSealed(mem_table_info);
+ }
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+ mutex_.AssertHeld();
+ WriteThread::Writer nonmem_w;
+ std::unique_ptr<WritableFile> lfile;
+ log::Writer* new_log = nullptr;
+ MemTable* new_mem = nullptr;
+
+ // Recoverable state is persisted in WAL. After memtable switch, WAL might
+ // be deleted, so we write the state to memtable to be persisted as well.
+ Status s = WriteRecoverableState();
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Attempt to switch to a new memtable and trigger flush of old.
+ // Do this without holding the dbmutex lock.
+ assert(versions_->prev_log_number() == 0);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ bool creating_new_log = !log_empty_;
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ uint64_t recycle_log_number = 0;
+ if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+ !log_recycle_files_.empty()) {
+ recycle_log_number = log_recycle_files_.front();
+ }
+ uint64_t new_log_number =
+ creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+ // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+ MemTableInfo memtable_info;
+ memtable_info.cf_name = cfd->GetName();
+ memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+ memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+ memtable_info.num_entries = cfd->mem()->num_entries();
+ memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif // ROCKSDB_LITE
+ // Log this later after lock release. It may be outdated, e.g., if background
+ // flush happens before logging, but that should be ok.
+ int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+ const auto preallocate_block_size =
+ GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+ mutex_.Unlock();
+ if (creating_new_log) {
+ // TODO: Write buffer size passed in should be max of all CF's instead
+ // of mutable_cf_options.write_buffer_size.
+ s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+ &new_log);
+ }
+ if (s.ok()) {
+ SequenceNumber seq = versions_->LastSequence();
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+ context->superversion_context.NewSuperVersion();
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] New memtable created with log file: #%" PRIu64
+ ". Immutable memtables: %d.\n",
+ cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+ mutex_.Lock();
+ if (recycle_log_number != 0) {
+ // Since renaming the file is done outside DB mutex, we need to ensure
+ // concurrent full purges don't delete the file while we're recycling it.
+ // To achieve that we hold the old log number in the recyclable list until
+ // after it has been renamed.
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ log_write_mutex_.Lock();
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ log_write_mutex_.Unlock();
+ }
+
+ if (!s.ok()) {
+ // how do we fail if we're not creating new log?
+ assert(creating_new_log);
+ if (new_mem) {
+ delete new_mem;
+ }
+ if (new_log) {
+ delete new_log;
+ }
+ SuperVersion* new_superversion =
+ context->superversion_context.new_superversion.release();
+ if (new_superversion != nullptr) {
+ delete new_superversion;
+ }
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+    // All of this is just an optimization to delete logs that
+    // are no longer needed -- if a CF is empty, that means it
+    // doesn't need that particular log to stay alive, so we just
+    // advance the log number. No need to persist this in the manifest.
+ if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+ loop_cfd->imm()->NumNotFlushed() == 0) {
+ if (creating_new_log) {
+ loop_cfd->SetLogNumber(logfile_number_);
+ }
+ loop_cfd->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+#ifndef ROCKSDB_LITE
+ mutex_.Unlock();
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+ mutex_.Lock();
+#endif // ROCKSDB_LITE
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
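+
+// A worked example of the sizing above (illustrative numbers): with
+// write_buffer_size = 64 MB, the initial estimate is 64 MB / 10 + 64 MB,
+// i.e. roughly 70 MB, which is then clamped by max_total_wal_size,
+// db_write_buffer_size, and the write buffer manager's buffer size whenever
+// those limits are configured and smaller.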
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ if (nullptr == opt.timestamp) {
+    // Pre-allocate the size of the write batch conservatively.
+    // 8 bytes are taken by the header, 4 bytes for the count, 1 byte for the
+    // type, and we allocate 11 extra bytes to cover the varint-encoded key
+    // and value lengths.
+ WriteBatch batch(key.size() + value.size() + 24);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+ }
+ const Slice* ts = opt.timestamp;
+ assert(nullptr != ts);
+ size_t ts_sz = ts->size();
+ WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
+ ts_sz);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ s = batch.AssignTimestamp(*ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch;
+ batch.Delete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch;
+ batch.SingleDelete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch;
+ batch.DeleteRange(column_family, begin_key, end_key);
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch;
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
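+
+// A hedged usage sketch of the convenience wrappers above (a minimal
+// illustration; `db` is an already-opened DB* and error handling is
+// omitted):
+//
+//   WriteOptions wo;
+//   db->Put(wo, "key", "value");    // single-key write
+//   db->Merge(wo, "counter", "1");  // requires a merge operator in Options
+//   db->Delete(wo, "key");          // point delete
+//   db->DeleteRange(wo, db->DefaultColumnFamily(), "a", "z");  // [a, z)
+//   // Each call builds a one-entry WriteBatch and forwards it to Write().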
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_secondary_test.cc b/src/rocksdb/db/db_impl/db_secondary_test.cc
new file mode 100644
index 000000000..0b34181de
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_secondary_test.cc
@@ -0,0 +1,869 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class DBSecondaryTest : public DBTestBase {
+ public:
+ DBSecondaryTest()
+ : DBTestBase("/db_secondary_test"),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTest() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ db_secondary_->DestroyColumnFamilyHandle(h);
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTest::OpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir,
+ int expected_log, int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
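+
+// A hedged sketch of the secondary-instance workflow exercised above (a
+// minimal illustration outside the test fixture; paths are placeholders and
+// error handling is abbreviated):
+//
+//   Options opts;
+//   opts.max_open_files = -1;  // keep all files open, as these tests do
+//   DB* secondary = nullptr;
+//   Status s = DB::OpenAsSecondary(opts, "/path/to/primary_db",
+//                                  "/path/to/secondary_scratch_dir",
+//                                  &secondary);
+//   if (s.ok()) {
+//     s = secondary->TryCatchUpWithPrimary();  // replay new MANIFEST/WAL data
+//     std::string value;
+//     s = secondary->Get(ReadOptions(), "foo", &value);
+//     delete secondary;
+//   }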
+
+namespace {
+class TraceFileEnv : public EnvWrapper {
+ public:
+ explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& env_options) override {
+ class TracedRandomAccessFile : public RandomAccessFile {
+ public:
+ TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<int>& counter)
+ : target_(std::move(target)), files_closed_(counter) {}
+ ~TracedRandomAccessFile() override {
+ files_closed_.fetch_add(1, std::memory_order_relaxed);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<int>& files_closed_;
+ };
+ Status s = target()->NewRandomAccessFile(f, r, env_options);
+ if (s.ok()) {
+ r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+ }
+ return s;
+ }
+
+ int files_closed() const {
+ return files_closed_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<int> files_closed_{0};
+};
+} // namespace
+
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+ Options options;
+ options.env = env_;
+ options.max_open_files = 1;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Options options1;
+ std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+ options1.env = traced_env.get();
+ OpenSecondary(options1);
+
+ static const auto verify_db = [&]() {
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+ for (iter1->SeekToFirst(), iter2->SeekToFirst();
+ iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+ ASSERT_EQ(iter1->key(), iter2->key());
+ ASSERT_EQ(iter1->value(), iter2->value());
+ }
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ };
+
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Put("c", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Put("d", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+ Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+ ASSERT_TRUE(s.IsNotSupported());
+ CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "new_foo_value_1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_EQ(0, handles_secondary_.size());
+ ASSERT_NE(nullptr, db_secondary_);
+
+ ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Flush(0 /*cf*/));
+ ASSERT_OK(Flush(1 /*cf*/));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+ {"VersionSet::ProcessManifestWrites:AfterNewManifest",
+ "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+ "1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make sure db calls RecoverLogFiles so as to trigger a manifest write,
+ // which causes the db to switch to a new MANIFEST upon start.
+ port::Thread ro_db_thread([&]() {
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ CloseSecondary();
+ });
+ Reopen(options);
+ ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+ int table_files_not_exist = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers",
+ [&](void* arg) {
+ Status s = *reinterpret_cast<Status*>(arg);
+ if (s.IsPathNotFound()) {
+ ++table_files_not_exist;
+ } else if (!s.ok()) {
+ assert(false); // Should not reach here
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_NE(nullptr, db_secondary_full());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist);
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+ Options options;
+ options.env = env_;
+ const std::string kCfName1 = "pikachu";
+ CreateAndReopenWithCF({kCfName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+ ASSERT_OK(Flush(1 /*cf*/));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ Close();
+ CheckFileTypeCounts(dbname_, 1, 0, 1);
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ value.clear();
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+ // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+ // ..., 9.
+ const int kNumKeys = 10;
+  // Create kNumFiles SST files
+ for (int i = 0; i != kNumFiles; ++i) {
+ for (int j = 0; j != kNumKeys; ++j) {
+ ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ const auto& range_scan_db = [&]() {
+ ReadOptions tmp_ropts;
+ tmp_ropts.total_order_seek = true;
+ tmp_ropts.verify_checksums = true;
+ std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+ int cnt = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+ ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+ ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+ iter->value().ToString());
+ }
+ };
+
+ range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, we close
+  // the primary, restart it, perform a full compaction, close it again, and
+  // restart it again, so that the next time the secondary tries to catch up
+  // with the primary, it will skip the MANIFEST in the middle.
+ Reopen(options);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ range_scan_db();
+}
+
+// Here, "Snapshot" refers to the version edits written by
+// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after
+// switching from the old one.
+TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ASSERT_OK(Put("0", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::string value;
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value0", value);
+
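+  // Reopening the primary switches to a new MANIFEST that starts with a full
+  // snapshot of the current version; the secondary's next catch-up should skip
+  // those snapshot edits since its state already reflects them.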
+ Reopen(options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+}
+
+TEST_F(DBSecondaryTest, SwitchWAL) {
+ const int kNumKeysPerMemtable = 1;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
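+  // verify_db cross-checks the two instances: a parallel iterator scan plus
+  // point lookups in both directions must agree.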
+ const auto& verify_db = [](DB* db1, DB* db2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ };
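+  // With kNumKeysPerMemtable == 1 the memtable fills after every Put, so the
+  // primary keeps switching memtables (and WALs); each catch-up must pick up
+  // the newly created WAL.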
+ for (int k = 0; k != 16; ++k) {
+ ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), db_secondary_);
+ }
+}
+
+TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) {
+ const int kNumKeysPerMemtable = 1;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kCFName1 = "pikachu";
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ CreateAndReopenWithCF({kCFName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
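+  // Same cross-check as in SwitchWAL, but performed per column family.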
+ const auto& verify_db = [](DB* db1,
+ const std::vector<ColumnFamilyHandle*>& handles1,
+ DB* db2,
+ const std::vector<ColumnFamilyHandle*>& handles2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ ASSERT_EQ(handles1.size(), handles2.size());
+ for (size_t i = 0; i != handles1.size(); ++i) {
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ }
+ };
+ for (int k = 0; k != 8; ++k) {
+ ASSERT_OK(
+ Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(
+ Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k)));
+ TEST_SYNC_POINT(
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+ SyncPoint::GetInstance()->ClearTrace();
+ }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+ const int kNumKeysPerMemtable = 16;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ new SpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ WriteOptions write_opts;
+ WriteBatch wb;
+ wb.Put("key0", "value0");
+ wb.Put("key1", "value1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb));
+ ReadOptions read_opts;
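+  // iter1 is created before the catch-up below, so it pins the secondary's
+  // current (empty) state and must not see key0/key1 even afterwards.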
+ std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+ iter2->Seek("key0");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value0", iter2->value());
+ iter2->Seek("key1");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value1", iter2->value());
+
+ {
+ WriteBatch wb1;
+ wb1.Put("key0", "value01");
+ wb1.Put("key1", "value11");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+ }
+
+ {
+ WriteBatch wb2;
+ wb2.Put("key0", "new_value0");
+ wb2.Delete("key1");
+ ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+ }
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+ // iter3 should not see value01 and value11 at all.
+ iter3->Seek("key0");
+ ASSERT_TRUE(iter3->Valid());
+ ASSERT_EQ("new_value0", iter3->value());
+ iter3->Seek("key1");
+ ASSERT_FALSE(iter3->Valid());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+ bool called = false;
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ called = true;
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+ SyncPoint::GetInstance()->EnableProcessing();
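+  // The dependencies above make the primary's compaction purge obsolete files
+  // in the middle of the secondary's consistency check, so the first check
+  // attempt is expected to fail; the callback above verifies that.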
+
+ ASSERT_OK(Put("a", "value0"));
+ ASSERT_OK(Put("c", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "value1"));
+ ASSERT_OK(Put("d", "value1"));
+ ASSERT_OK(Flush());
+ port::Thread thread([this]() {
+ Options opts;
+ opts.env = env_;
+ opts.max_open_files = -1;
+ OpenSecondary(opts);
+ });
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+ ASSERT_TRUE(called);
+}
+#endif  // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}