From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 20:45:59 +0200
Subject: Adding upstream version 16.2.11+ds.

Signed-off-by: Daniel Baumann
---
 src/rocksdb/db/db_impl/db_impl.cc                  | 4550 ++++++++++++++++++++
 src/rocksdb/db/db_impl/db_impl.h                   | 2107 +++++++++
 src/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 3116 ++++++++++++++
 src/rocksdb/db/db_impl/db_impl_debug.cc            |  294 ++
 src/rocksdb/db/db_impl/db_impl_experimental.cc     |  151 +
 src/rocksdb/db/db_impl/db_impl_files.cc            |  667 +++
 src/rocksdb/db/db_impl/db_impl_open.cc             | 1651 +++++++
 src/rocksdb/db/db_impl/db_impl_readonly.cc         |  221 +
 src/rocksdb/db/db_impl/db_impl_readonly.h          |  137 +
 src/rocksdb/db/db_impl/db_impl_secondary.cc        |  671 +++
 src/rocksdb/db/db_impl/db_impl_secondary.h         |  333 ++
 src/rocksdb/db/db_impl/db_impl_write.cc            | 1839 ++++++++
 src/rocksdb/db/db_impl/db_secondary_test.cc        |  869 ++++
 13 files changed, 16606 insertions(+)
 create mode 100644 src/rocksdb/db/db_impl/db_impl.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_debug.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_experimental.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_files.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_open.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_readonly.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.cc
 create mode 100644 src/rocksdb/db/db_impl/db_impl_secondary.h
 create mode 100644 src/rocksdb/db/db_impl/db_impl_write.cc
 create mode 100644 src/rocksdb/db/db_impl/db_secondary_test.cc

diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..d7880fc1a
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,4550 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h" + +#include +#ifdef OS_SOLARIS +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/builder.h" +#include "db/compaction/compaction_job.h" +#include "db/db_info_dumper.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/flush_job.h" +#include "db/forward_iterator.h" +#include "db/import_column_family_job.h" +#include "db/job_context.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/malloc_stats.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/merge_helper.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/table_cache.h" +#include "db/table_properties_collector.h" +#include "db/transaction_log_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "db/write_callback.h" +#include "env/composite_env_wrapper.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" +#include "memtable/hash_linklist_rep.h" +#include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "rocksdb/stats_history.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/get_context.h" +#include "table/merging_iterator.h" +#include "table/multiget_context.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "test_util/sync_point.h" +#include "tools/sst_dump_tool_imp.h" +#include "util/autovector.h" +#include "util/build_version.h" +#include "util/cast_util.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); +void DumpRocksDBBuildVersion(Logger* log); + +CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) { + // Compressing memtable flushes might not help unless the sequential load + // optimization is used for leveled compaction. Otherwise the CPU and + // latency overhead is not offset by saving much space. 
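+  // Illustrative caller-side sketch (not part of this file) of the public
+  // Options fields this branch logic reads; the concrete values below are
+  // assumptions for the example only:
+  //
+  //   rocksdb::Options opts;
+  //   opts.compaction_style = rocksdb::kCompactionStyleUniversal;
+  //   opts.compaction_options_universal.compression_size_percent = -1;
+  //   opts.compression = rocksdb::kLZ4Compression;  // flushes use LZ4
+  //
+  //   // Or, leveled compaction with per-level compression:
+  //   opts.compaction_style = rocksdb::kCompactionStyleLevel;
+  //   opts.compression_per_level = {rocksdb::kNoCompression,
+  //                                 rocksdb::kNoCompression,
+  //                                 rocksdb::kZSTD};
+  //   // flushes (L0 output) then use compression_per_level[0], i.e. none.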
+ if (ioptions.compaction_style == kCompactionStyleUniversal) { + if (mutable_cf_options.compaction_options_universal + .compression_size_percent < 0) { + return mutable_cf_options.compression; + } else { + return kNoCompression; + } + } else if (!ioptions.compression_per_level.empty()) { + // For leveled compress when min_level_to_compress != 0. + return ioptions.compression_per_level[0]; + } else { + return mutable_cf_options.compression; + } +} + +namespace { +void DumpSupportInfo(Logger* logger) { + ROCKS_LOG_HEADER(logger, "Compression algorithms supported:"); + for (auto& compression : OptionsHelper::compression_type_string_map) { + if (compression.second != kNoCompression && + compression.second != kDisableCompressionOption) { + ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(), + CompressionTypeSupported(compression.second)); + } + } + ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s", + crc32c::IsFastCrc32Supported().c_str()); +} +} // namespace + +DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch, const bool batch_per_txn) + : dbname_(dbname), + own_info_log_(options.info_log == nullptr), + initial_db_options_(SanitizeOptions(dbname, options)), + env_(initial_db_options_.env), + fs_(initial_db_options_.file_system), + immutable_db_options_(initial_db_options_), + mutable_db_options_(initial_db_options_), + stats_(immutable_db_options_.statistics.get()), + mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + immutable_db_options_.use_adaptive_mutex), + default_cf_handle_(nullptr), + max_total_in_memory_state_(0), + file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), + file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( + file_options_, immutable_db_options_)), + seq_per_batch_(seq_per_batch), + batch_per_txn_(batch_per_txn), + db_lock_(nullptr), + shutting_down_(false), + manual_compaction_paused_(false), + bg_cv_(&mutex_), + logfile_number_(0), + log_dir_synced_(false), + log_empty_(true), + persist_stats_cf_handle_(nullptr), + log_sync_cv_(&mutex_), + total_log_size_(0), + is_snapshot_supported_(true), + write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), + write_thread_(immutable_db_options_), + nonmem_write_thread_(immutable_db_options_), + write_controller_(mutable_db_options_.delayed_write_rate), + last_batch_group_size_(0), + unscheduled_flushes_(0), + unscheduled_compactions_(0), + bg_bottom_compaction_scheduled_(0), + bg_compaction_scheduled_(0), + num_running_compactions_(0), + bg_flush_scheduled_(0), + num_running_flushes_(0), + bg_purge_scheduled_(0), + disable_delete_obsolete_files_(0), + pending_purge_obsolete_files_(0), + delete_obsolete_files_last_run_(env_->NowMicros()), + last_stats_dump_time_microsec_(0), + next_job_id_(1), + has_unpersisted_data_(false), + unable_to_release_oldest_log_(false), + num_running_ingest_file_(0), +#ifndef ROCKSDB_LITE + wal_manager_(immutable_db_options_, file_options_, seq_per_batch), +#endif // ROCKSDB_LITE + event_logger_(immutable_db_options_.info_log.get()), + bg_work_paused_(0), + bg_compaction_paused_(0), + refitting_level_(false), + opened_successfully_(false), + two_write_queues_(options.two_write_queues), + manual_wal_flush_(options.manual_wal_flush), + // last_sequencee_ is always maintained by the main queue that also writes + // to the memtable. When two_write_queues_ is disabled last seq in + // memtable is the same as last seq published to the readers. 
When it is + // enabled but seq_per_batch_ is disabled, last seq in memtable still + // indicates last published seq since wal-only writes that go to the 2nd + // queue do not consume a sequence number. Otherwise writes performed by + // the 2nd queue could change what is visible to the readers. In this + // cases, last_seq_same_as_publish_seq_==false, the 2nd queue maintains a + // separate variable to indicate the last published sequence. + last_seq_same_as_publish_seq_( + !(seq_per_batch && options.two_write_queues)), + // Since seq_per_batch_ is currently set only by WritePreparedTxn which + // requires a custom gc for compaction, we use that to set use_custom_gc_ + // as well. + use_custom_gc_(seq_per_batch), + shutdown_initiated_(false), + own_sfm_(options.sst_file_manager == nullptr), + preserve_deletes_(options.preserve_deletes), + closed_(false), + error_handler_(this, immutable_db_options_, &mutex_), + atomic_flush_install_cv_(&mutex_) { + // !batch_per_trx_ implies seq_per_batch_ because it is only unset for + // WriteUnprepared, which should use seq_per_batch_. + assert(batch_per_txn_ || seq_per_batch_); + env_->GetAbsolutePath(dbname, &db_absolute_path_); + + // Reserve ten files or so for other uses and give the rest to TableCache. + // Give a large number for setting of "infinite" open files. + const int table_cache_size = (mutable_db_options_.max_open_files == -1) + ? TableCache::kInfiniteCapacity + : mutable_db_options_.max_open_files - 10; + LRUCacheOptions co; + co.capacity = table_cache_size; + co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_cache_ = NewLRUCache(co); + + versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, + table_cache_.get(), write_buffer_manager_, + &write_controller_, &block_cache_tracer_)); + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + + DumpRocksDBBuildVersion(immutable_db_options_.info_log.get()); + DumpDBFileSummary(immutable_db_options_, dbname_); + immutable_db_options_.Dump(immutable_db_options_.info_log.get()); + mutable_db_options_.Dump(immutable_db_options_.info_log.get()); + DumpSupportInfo(immutable_db_options_.info_log.get()); + + // always open the DB with 0 here, which means if preserve_deletes_==true + // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() + // is called by client and this seqnum is advanced. + preserve_deletes_seqnum_.store(0); +} + +Status DBImpl::Resume() { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB"); + + InstrumentedMutexLock db_mutex(&mutex_); + + if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) { + // Nothing to do + return Status::OK(); + } + + if (error_handler_.IsRecoveryInProgress()) { + // Don't allow a mix of manual and automatic recovery + return Status::Busy(); + } + + mutex_.Unlock(); + Status s = error_handler_.RecoverFromBGError(true); + mutex_.Lock(); + return s; +} + +// This function implements the guts of recovery from a background error. It +// is eventually called for both manual as well as automatic recovery. It does +// the following - +// 1. Wait for currently scheduled background flush/compaction to exit, in +// order to inadvertently causing an error and thinking recovery failed +// 2. Flush memtables if there's any data for all the CFs. 
This may result +// another error, which will be saved by error_handler_ and reported later +// as the recovery status +// 3. Find and delete any obsolete files +// 4. Schedule compactions if needed for all the CFs. This is needed as the +// flush in the prior step might have been a no-op for some CFs, which +// means a new super version wouldn't have been installed +Status DBImpl::ResumeImpl() { + mutex_.AssertHeld(); + WaitForBackgroundWork(); + + Status bg_error = error_handler_.GetBGError(); + Status s; + if (shutdown_initiated_) { + // Returning shutdown status to SFM during auto recovery will cause it + // to abort the recovery and allow the shutdown to progress + s = Status::ShutdownInProgress(); + } + if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } + + // We cannot guarantee consistency of the WAL. So force flush Memtables of + // all the column families + if (s.ok()) { + FlushOptions flush_opts; + // We allow flush to stall write since we are trying to resume from error. + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + autovector cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cfd->Ref(); + mutex_.Unlock(); + s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery); + mutex_.Lock(); + cfd->UnrefAndTryDelete(); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DB resume requested but failed due to Flush failure [%s]", + s.ToString().c_str()); + } + } + + JobContext job_context(0); + FindObsoleteFiles(&job_context, true); + if (s.ok()) { + s = error_handler_.ClearBGError(); + } + mutex_.Unlock(); + + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } + mutex_.Lock(); + // Check for shutdown again before scheduling further compactions, + // since we released and re-acquired the lock above + if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + if (s.ok()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + } + + // Wake up any waiters - in this case, it could be the shutdown thread + bg_cv_.SignalAll(); + + // No need to check BGError again. 
If something happened, event listener would + // be notified and the operation causing it would have failed + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } +} + +// Will lock the mutex_, will wait for completion if wait is true +void DBImpl::CancelAllBackgroundWork(bool wait) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Shutdown: canceling all background work"); + + if (thread_dump_stats_ != nullptr) { + thread_dump_stats_->cancel(); + thread_dump_stats_.reset(); + } + if (thread_persist_stats_ != nullptr) { + thread_persist_stats_->cancel(); + thread_persist_stats_.reset(); + } + InstrumentedMutexLock l(&mutex_); + if (!shutting_down_.load(std::memory_order_acquire) && + has_unpersisted_data_.load(std::memory_order_relaxed) && + !mutable_db_options_.avoid_flush_during_shutdown) { + if (immutable_db_options_.atomic_flush) { + autovector cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { + cfd->Ref(); + mutex_.Unlock(); + FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + cfd->UnrefAndTryDelete(); + } + } + } + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } + + shutting_down_.store(true, std::memory_order_release); + bg_cv_.SignalAll(); + if (!wait) { + return; + } + WaitForBackgroundWork(); +} + +Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { + bg_cv_.Wait(); + } + mutex_.Unlock(); + + // CancelAllBackgroundWork called with false means we just set the shutdown + // marker. 
After this we do a variant of the waiting and unschedule work + // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) + CancelAllBackgroundWork(false); + int bottom_compactions_unscheduled = + env_->UnSchedule(this, Env::Priority::BOTTOM); + int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); + int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); + Status ret; + mutex_.Lock(); + bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; + bg_compaction_scheduled_ -= compactions_unscheduled; + bg_flush_scheduled_ -= flushes_unscheduled; + + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_ || bg_purge_scheduled_ || + pending_purge_obsolete_files_ || + error_handler_.IsRecoveryInProgress()) { + TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); + bg_cv_.Wait(); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished", + &files_grabbed_for_purge_); + EraseThreadStatusDbInfo(); + flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); + + while (!flush_queue_.empty()) { + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req) { + iter.first->UnrefAndTryDelete(); + } + } + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + cfd->UnrefAndTryDelete(); + } + + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { + // we need to delete handle outside of lock because it does its own locking + mutex_.Unlock(); + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } + mutex_.Lock(); + } + + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. As a + // result, all "live" files can get deleted by accident. However, corrupted + // manifest is recoverable by RepairDB(). + if (opened_successfully_) { + JobContext job_context(next_job_id_.fetch_add(1)); + FindObsoleteFiles(&job_context, true); + + mutex_.Unlock(); + // manifest number starting from 2 + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); + } + + for (auto l : logs_to_free_) { + delete l; + } + for (auto& log : logs_) { + uint64_t log_number = log.writer->get_log_number(); + Status s = log.ClearWriter(); + if (!s.ok()) { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to Sync WAL file %s with error -- %s", + LogFileName(immutable_db_options_.wal_dir, log_number).c_str(), + s.ToString().c_str()); + // Retain the first error + if (ret.ok()) { + ret = s; + } + } + } + logs_.clear(); + + // Table cache may have table handles holding blocks from the block cache. + // We need to release them before the block cache is destroyed. 
The block
+  // cache may be destroyed inside versions_.reset(), when column family data
+  // list is destroyed, so leaving handles in table cache after
+  // versions_.reset() may cause issues.
+  // Here we clean all unreferenced handles in table cache.
+  // Now we assume all user queries have finished, so only the version set
+  // itself can possibly hold the blocks from block cache. After releasing
+  // unreferenced handles here, only handles held by the version set are left;
+  // inside versions_.reset() we will release them. There, we need to make sure
+  // every time a handle is released, we erase it from the cache too. By doing
+  // that, we can guarantee that after versions_.reset(), table cache is empty
+  // so the cache can be safely destroyed.
+  table_cache_->EraseUnRefEntries();
+
+  for (auto& txn_entry : recovered_transactions_) {
+    delete txn_entry.second;
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+  LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), call
+  // Close() on it before closing the info_log. Otherwise, a background thread
+  // in SstFileManagerImpl might try to log something.
+  if (immutable_db_options_.sst_file_manager && own_sfm_) {
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    sfm->Close();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (immutable_db_options_.info_log && own_info_log_) {
+    Status s = immutable_db_options_.info_log->Close();
+    if (ret.ok()) {
+      ret = s;
+    }
+  }
+
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for those where users didn't release
+    // certain resource and they can release them and come back and
+    // retry. In this case, we wrap this exception to something else.
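+    // Caller-side sketch (illustrative, not part of this file): per the
+    // contract described above, an Aborted status from Close() means some
+    // user-owned resource is still alive and the close can be retried:
+    //
+    //   rocksdb::Status s = db->Close();
+    //   if (s.IsAborted()) {
+    //     // release remaining column family handles / snapshots / iterators,
+    //     // then call db->Close() again
+    //   }
+    //   delete db;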
+ return Status::Incomplete(ret.ToString()); + } + return ret; +} + +Status DBImpl::CloseImpl() { return CloseHelper(); } + +DBImpl::~DBImpl() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } +} + +void DBImpl::MaybeIgnoreError(Status* s) const { + if (s->ok() || immutable_db_options_.paranoid_checks) { + // No change needed + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s", + s->ToString().c_str()); + *s = Status::OK(); + } +} + +const Status DBImpl::CreateArchivalDirectory() { + if (immutable_db_options_.wal_ttl_seconds > 0 || + immutable_db_options_.wal_size_limit_mb > 0) { + std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir); + return env_->CreateDirIfMissing(archivalPath); + } + return Status::OK(); +} + +void DBImpl::PrintStatistics() { + auto dbstats = immutable_db_options_.statistics.get(); + if (dbstats) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", + dbstats->ToString().c_str()); + } +} + +void DBImpl::StartTimedTasks() { + unsigned int stats_dump_period_sec = 0; + unsigned int stats_persist_period_sec = 0; + { + InstrumentedMutexLock l(&mutex_); + stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec; + if (stats_dump_period_sec > 0) { + if (!thread_dump_stats_) { + thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::DumpStats(); }, "dump_st", env_, + static_cast(stats_dump_period_sec) * kMicrosInSecond)); + } + } + stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; + if (stats_persist_period_sec > 0) { + if (!thread_persist_stats_) { + thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::PersistStats(); }, "pst_st", env_, + static_cast(stats_persist_period_sec) * kMicrosInSecond)); + } + } + } +} + +// esitmate the total size of stats_history_ +size_t DBImpl::EstimateInMemoryStatsHistorySize() const { + size_t size_total = + sizeof(std::map>); + if (stats_history_.size() == 0) return size_total; + size_t size_per_slice = + sizeof(uint64_t) + sizeof(std::map); + // non-empty map, stats_history_.begin() guaranteed to exist + std::map sample_slice(stats_history_.begin()->second); + for (const auto& pairs : sample_slice) { + size_per_slice += + pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second); + } + size_total = size_per_slice * stats_history_.size(); + return size_total; +} + +void DBImpl::PersistStats() { + TEST_SYNC_POINT("DBImpl::PersistStats:Entry"); +#ifndef ROCKSDB_LITE + if (shutdown_initiated_) { + return; + } + uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; + Statistics* statistics = immutable_db_options_.statistics.get(); + if (!statistics) { + return; + } + size_t stats_history_size_limit = 0; + { + InstrumentedMutexLock l(&mutex_); + stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; + } + + std::map stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- PERSISTING STATS -------"); + + if (immutable_db_options_.persist_stats_to_disk) { + WriteBatch batch; + if (stats_slice_initialized_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reading %" ROCKSDB_PRIszt " stats from statistics\n", + stats_slice_.size()); + for (const auto& stat : stats_map) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { 
+ uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s", + s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); + } + // TODO(Zhongyi): add purging for persisted data + } else { + InstrumentedMutexLock l(&stats_history_mutex_); + // calculate the delta from last time + if (stats_slice_initialized_) { + std::map stats_delta; + for (const auto& stat : stats_map) { + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to in-memory stats history", + stats_slice_.size(), now_seconds); + stats_history_[now_seconds] = stats_delta; + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); + + // delete older stats snapshots to control memory consumption + size_t stats_history_size = EstimateInMemoryStatsHistorySize(); + bool purge_needed = stats_history_size > stats_history_size_limit; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + while (purge_needed && !stats_history_.empty()) { + stats_history_.erase(stats_history_.begin()); + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); + } +#endif // !ROCKSDB_LITE +} + +bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map* stats_map) { + assert(new_time); + assert(stats_map); + if (!new_time || !stats_map) return false; + // lock when search for start_time + { + InstrumentedMutexLock l(&stats_history_mutex_); + auto it = stats_history_.lower_bound(start_time); + if (it != stats_history_.end() && it->first < end_time) { + // make a copy for timestamp and stats_map + *new_time = it->first; + *stats_map = it->second; + return true; + } else { + return false; + } + } +} + +Status DBImpl::GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) { + if (!stats_iterator) { + return Status::InvalidArgument("stats_iterator not preallocated."); + } + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } + return (*stats_iterator)->status(); +} + +void DBImpl::DumpStats() { + TEST_SYNC_POINT("DBImpl::DumpStats:1"); +#ifndef ROCKSDB_LITE + const DBPropertyInfo* cf_property_info = + GetPropertyInfo(DB::Properties::kCFStats); + assert(cf_property_info != nullptr); + const 
DBPropertyInfo* db_property_info = + GetPropertyInfo(DB::Properties::kDBStats); + assert(db_property_info != nullptr); + + std::string stats; + if (shutdown_initiated_) { + return; + } + { + InstrumentedMutexLock l(&mutex_); + default_cf_internal_stats_->GetStringProperty( + *db_property_info, DB::Properties::kDBStats, &stats); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + } + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + } + } + } + TEST_SYNC_POINT("DBImpl::DumpStats:2"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- DUMPING STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + if (immutable_db_options_.dump_malloc_stats) { + stats.clear(); + DumpMallocStats(&stats); + if (!stats.empty()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- Malloc STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + } + } +#endif // !ROCKSDB_LITE + + PrintStatistics(); +} + +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = + static_cast_with_check( + column_family); + ColumnFamilyData* cfd = cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + +void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { + if (!job_context->logs_to_free.empty()) { + for (auto l : job_context->logs_to_free) { + AddToLogsToFreeQueue(l); + } + job_context->logs_to_free.clear(); + } +} + +Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { + assert(cfd); + Directory* ret_dir = cfd->GetDataDir(path_id); + if (ret_dir == nullptr) { + return directories_.GetDataDir(path_id); + } + return ret_dir; +} + +Status DBImpl::SetOptions( + ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + (void)column_family; + (void)options_map; + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + auto* cfd = reinterpret_cast(column_family)->cfd(); + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetOptions() on column family [%s], empty input", + cfd->GetName().c_str()); + return Status::InvalidArgument("empty input"); + } + + MutableCFOptions new_options; + Status s; + Status persist_options_status; + SuperVersionContext sv_context(/* create_superversion */ true); + { + auto db_options = GetDBOptions(); + InstrumentedMutexLock l(&mutex_); + s = cfd->SetOptions(db_options, options_map); + if (s.ok()) { + new_options = *cfd->GetLatestMutableCFOptions(); + // Append new version to recompute compaction score. + VersionEdit dummy_edit; + versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + // Trigger possible flush/compactions. This has to be before we persist + // options to file, otherwise there will be a deadlock with writer + // thread. 
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); + + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, true /*need_enter_write_thread*/); + bg_cv_.SignalAll(); + } + } + sv_context.Clean(); + + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] SetOptions() succeeded", cfd->GetName().c_str()); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + s = persist_options_status; + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed", + cfd->GetName().c_str()); + } + LogFlush(immutable_db_options_.info_log); + return s; +#endif // ROCKSDB_LITE +} + +Status DBImpl::SetDBOptions( + const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + (void)options_map; + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + if (options_map.empty()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "SetDBOptions(), empty input."); + return Status::InvalidArgument("empty input"); + } + + MutableDBOptions new_options; + Status s; + Status persist_options_status; + bool wal_changed = false; + WriteContext write_context; + { + InstrumentedMutexLock l(&mutex_); + s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, + &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } + if (s.ok()) { + const BGJobLimits current_bg_job_limits = + GetBGJobLimits(immutable_db_options_.max_background_flushes, + mutable_db_options_.max_background_compactions, + mutable_db_options_.max_background_jobs, + /* parallelize_compactions */ true); + const BGJobLimits new_bg_job_limits = GetBGJobLimits( + immutable_db_options_.max_background_flushes, + new_options.max_background_compactions, + new_options.max_background_jobs, /* parallelize_compactions */ true); + + const bool max_flushes_increased = + new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes; + const bool max_compactions_increased = + new_bg_job_limits.max_compactions > + current_bg_job_limits.max_compactions; + + if (max_flushes_increased || max_compactions_increased) { + if (max_flushes_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes, + Env::Priority::HIGH); + } + + if (max_compactions_increased) { + env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions, + Env::Priority::LOW); + } + + MaybeScheduleFlushOrCompaction(); + } + + if (new_options.stats_dump_period_sec != + mutable_db_options_.stats_dump_period_sec) { + if (thread_dump_stats_) { + mutex_.Unlock(); + thread_dump_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_dump_period_sec > 0) { + thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::DumpStats(); }, "dump_st", env_, + static_cast(new_options.stats_dump_period_sec) * + 
kMicrosInSecond)); + } else { + thread_dump_stats_.reset(); + } + } + if (new_options.stats_persist_period_sec != + mutable_db_options_.stats_persist_period_sec) { + if (thread_persist_stats_) { + mutex_.Unlock(); + thread_persist_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_persist_period_sec > 0) { + thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( + [this]() { DBImpl::PersistStats(); }, "pst_st", env_, + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); + } else { + thread_persist_stats_.reset(); + } + } + write_controller_.set_max_delayed_write_rate( + new_options.delayed_write_rate); + table_cache_.get()->SetCapacity(new_options.max_open_files == -1 + ? TableCache::kInfiniteCapacity + : new_options.max_open_files - 10); + wal_changed = mutable_db_options_.wal_bytes_per_sync != + new_options.wal_bytes_per_sync; + mutable_db_options_ = new_options; + file_options_for_compaction_ = FileOptions(new_db_options); + file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite( + file_options_for_compaction_, immutable_db_options_); + versions_->ChangeFileOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options + file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead( + file_options_for_compaction_, immutable_db_options_); + file_options_for_compaction_.compaction_readahead_size = + mutable_db_options_.compaction_readahead_size; + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) { + Status purge_wal_status = SwitchWAL(&write_context); + if (!purge_wal_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to purge WAL files in SetDBOptions() -- %s", + purge_wal_status.ToString().c_str()); + } + } + persist_options_status = WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + write_thread_.ExitUnbatched(&w); + } + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); + for (const auto& o : options_map) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), + o.second.c_str()); + } + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded"); + new_options.Dump(immutable_db_options_.info_log.get()); + if (!persist_options_status.ok()) { + if (immutable_db_options_.fail_if_options_file_error) { + s = Status::IOError( + "SetDBOptions() succeeded, but unable to persist options", + persist_options_status.ToString()); + } + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to persist options in SetDBOptions() -- %s", + persist_options_status.ToString().c_str()); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed"); + } + LogFlush(immutable_db_options_.info_log); + return s; +#endif // ROCKSDB_LITE +} + +// return the same level if it cannot be moved +int DBImpl::FindMinimumEmptyLevelFitting( + ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/, + int level) { + mutex_.AssertHeld(); + const auto* vstorage = cfd->current()->storage_info(); + int minimum_level = level; + for (int i = level - 1; i > 0; --i) { + // stop if level i is not empty + if (vstorage->NumLevelFiles(i) > 0) break; + // stop if level i is too small (cannot fit the level files) + if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { + break; + } + + minimum_level = i; + } + return minimum_level; +} + +Status DBImpl::FlushWAL(bool 
sync) { + if (manual_wal_flush_) { + Status s; + { + // We need to lock log_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + s = cur_log_writer->WriteBuffer(); + } + if (!s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(s); + // whether sync or not, we should abort the rest of function upon error + return s; + } + if (!sync) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); + return s; + } + } + if (!sync) { + return Status::OK(); + } + // sync = true + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); + return SyncWAL(); +} + +Status DBImpl::SyncWAL() { + autovector logs_to_sync; + bool need_log_dir_sync; + uint64_t current_log_number; + + { + InstrumentedMutexLock l(&mutex_); + assert(!logs_.empty()); + + // This SyncWAL() call only cares about logs up to this number. + current_log_number = logfile_number_; + + while (logs_.front().number <= current_log_number && + logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + // First check that logs are safe to sync in background. + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) { + return Status::NotSupported( + "SyncWAL() is not supported for this implementation of WAL file", + immutable_db_options_.allow_mmap_writes + ? "try setting Options::allow_mmap_writes to false" + : Slice()); + } + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number <= current_log_number; ++it) { + auto& log = *it; + assert(!log.getting_synced); + log.getting_synced = true; + logs_to_sync.push_back(log.writer); + } + + need_log_dir_sync = !log_dir_synced_; + } + + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); + RecordTick(stats_, WAL_FILE_SYNCED); + Status status; + for (log::Writer* log : logs_to_sync) { + status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); + if (!status.ok()) { + break; + } + } + if (status.ok() && need_log_dir_sync) { + status = directories_.GetWalDir()->Fsync(); + } + TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); + + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); + { + InstrumentedMutexLock l(&mutex_); + MarkLogsSynced(current_log_number, need_log_dir_sync, status); + } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); + + return status; +} + +Status DBImpl::LockWAL() { + log_write_mutex_.Lock(); + auto cur_log_writer = logs_.back().writer; + auto status = cur_log_writer->WriteBuffer(); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + status.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(status); + } + return status; +} + +Status DBImpl::UnlockWAL() { + log_write_mutex_.Unlock(); + return Status::OK(); +} + +void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, + const Status& status) { + mutex_.AssertHeld(); + if (synced_dir && logfile_number_ == up_to && status.ok()) { + log_dir_synced_ = true; + } + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { + auto& log = *it; + assert(log.getting_synced); + if (status.ok() && logs_.size() > 1) { + 
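+      // Only fully synced logs other than the newest one are released here,
+      // so the active WAL writer (logs_.back()) is never freed.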
logs_to_free_.push_back(log.ReleaseWriter()); + // To modify logs_ both mutex_ and log_write_mutex_ must be held + InstrumentedMutexLock l(&log_write_mutex_); + it = logs_.erase(it); + } else { + log.getting_synced = false; + ++it; + } + } + assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || + (logs_.size() == 1 && !logs_[0].getting_synced)); + log_sync_cv_.SignalAll(); +} + +SequenceNumber DBImpl::GetLatestSequenceNumber() const { + return versions_->LastSequence(); +} + +void DBImpl::SetLastPublishedSequence(SequenceNumber seq) { + versions_->SetLastPublishedSequence(seq); +} + +bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) { + if (seqnum > preserve_deletes_seqnum_.load()) { + preserve_deletes_seqnum_.store(seqnum); + return true; + } else { + return false; + } +} + +InternalIterator* DBImpl::NewInternalIterator( + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + + mutex_.Lock(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + mutex_.Unlock(); + ReadOptions roptions; + return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg, + sequence); +} + +void DBImpl::SchedulePurge() { + mutex_.AssertHeld(); + assert(opened_successfully_); + + // Purge operations are put into High priority queue + bg_purge_scheduled_++; + env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr); +} + +void DBImpl::BackgroundCallPurge() { + mutex_.Lock(); + + while (!logs_to_free_queue_.empty()) { + assert(!logs_to_free_queue_.empty()); + log::Writer* log_writer = *(logs_to_free_queue_.begin()); + logs_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete log_writer; + mutex_.Lock(); + } + while (!superversions_to_free_queue_.empty()) { + assert(!superversions_to_free_queue_.empty()); + SuperVersion* sv = superversions_to_free_queue_.front(); + superversions_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete sv; + mutex_.Lock(); + } + + // Can't use iterator to go over purge_files_ because inside the loop we're + // unlocking the mutex that protects purge_files_. + while (!purge_files_.empty()) { + auto it = purge_files_.begin(); + // Need to make a copy of the PurgeFilesInfo before unlocking the mutex. + PurgeFileInfo purge_file = it->second; + + const std::string& fname = purge_file.fname; + const std::string& dir_to_sync = purge_file.dir_to_sync; + FileType type = purge_file.type; + uint64_t number = purge_file.number; + int job_id = purge_file.job_id; + + purge_files_.erase(it); + + mutex_.Unlock(); + DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); + mutex_.Lock(); + } + + bg_purge_scheduled_--; + + bg_cv_.SignalAll(); + // IMPORTANT:there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. 
+ mutex_.Unlock(); +} + +namespace { +struct IterState { + IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version, + bool _background_purge) + : db(_db), + mu(_mu), + super_version(_super_version), + background_purge(_background_purge) {} + + DBImpl* db; + InstrumentedMutex* mu; + SuperVersion* super_version; + bool background_purge; +}; + +static void CleanupIteratorState(void* arg1, void* /*arg2*/) { + IterState* state = reinterpret_cast(arg1); + + if (state->super_version->Unref()) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + + state->mu->Lock(); + state->super_version->Cleanup(); + state->db->FindObsoleteFiles(&job_context, false, true); + if (state->background_purge) { + state->db->ScheduleBgLogWriterClose(&job_context); + state->db->AddSuperVersionsToFreeQueue(state->super_version); + state->db->SchedulePurge(); + } + state->mu->Unlock(); + + if (!state->background_purge) { + delete state->super_version; + } + if (job_context.HaveSomethingToDelete()) { + if (state->background_purge) { + // PurgeObsoleteFiles here does not delete files. Instead, it adds the + // files to be deleted to a job queue, and deletes it in a separate + // background thread. + state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); + state->mu->Lock(); + state->db->SchedulePurge(); + state->mu->Unlock(); + } else { + state->db->PurgeObsoleteFiles(job_context); + } + } + job_context.Clean(); + } + + delete state; +} +} // namespace + +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence) { + InternalIterator* internal_iter; + assert(arena != nullptr); + assert(range_del_agg != nullptr); + // Need to create internal iterator from the arena. 
+ MergeIteratorBuilder merge_iter_builder( + &cfd->internal_comparator(), arena, + !read_options.total_order_seek && + super_version->mutable_cf_options.prefix_extractor != nullptr); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(read_options, arena)); + std::unique_ptr range_del_iter; + Status s; + if (!read_options.ignore_range_deletions) { + range_del_iter.reset( + super_version->mem->NewRangeTombstoneIterator(read_options, sequence)); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + // Collect all needed child iterators for immutable memtables + if (s.ok()) { + super_version->imm->AddIterators(read_options, &merge_iter_builder); + if (!read_options.ignore_range_deletions) { + s = super_version->imm->AddRangeTombstoneIterators(read_options, arena, + range_del_agg); + } + } + TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); + if (s.ok()) { + // Collect iterators for files in L0 - Ln + if (read_options.read_tier != kMemtableTier) { + super_version->current->AddIterators(read_options, file_options_, + &merge_iter_builder, range_del_agg); + } + internal_iter = merge_iter_builder.Finish(); + IterState* cleanup = + new IterState(this, &mutex_, super_version, + read_options.background_purge_on_iterator_cleanup || + immutable_db_options_.avoid_unnecessary_blocking_io); + internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); + + return internal_iter; + } else { + CleanupSuperVersion(super_version); + } + return NewErrorInternalIterator(s, arena); +} + +ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { + return default_cf_handle_; +} + +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + +Status DBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + return GetImpl(read_options, key, get_impl_options); +} + +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + auto cfh = + reinterpret_cast(get_impl_options.column_family); + auto cfd = cfh->cfd(); + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(get_impl_options.column_family, key); + } + } + + // Acquire SuperVersion + SuperVersion* sv = GetAndRefSuperVersion(cfd); + + TEST_SYNC_POINT("DBImpl::GetImpl:1"); + TEST_SYNC_POINT("DBImpl::GetImpl:2"); + + SequenceNumber snapshot; + if (read_options.snapshot != nullptr) { + if (get_impl_options.callback) { + // Already calculated based on read_options.snapshot + snapshot = get_impl_options.callback->max_visible_seq(); + } else { + snapshot = + reinterpret_cast(read_options.snapshot)->number_; + } + } else { + // Note that the snapshot is assigned AFTER referencing the super + // version because otherwise a flush happening in between may compact away + // data for the snapshot, so the reader would see neither data that was be + // visible to the snapshot before compaction nor the newer data inserted + // afterwards. 
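+    // User-visible counterpart (illustrative sketch, not part of this file):
+    // an explicit snapshot in ReadOptions pins the sequence number instead of
+    // taking the latest one here:
+    //
+    //   const rocksdb::Snapshot* snap = db->GetSnapshot();
+    //   rocksdb::ReadOptions ro;
+    //   ro.snapshot = snap;
+    //   std::string value;
+    //   rocksdb::Status s = db->Get(ro, "key", &value);  // as of snap
+    //   db->ReleaseSnapshot(snap);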
+ snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + if (get_impl_options.callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + get_impl_options.callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = get_impl_options.callback->max_visible_seq(); + } + } + TEST_SYNC_POINT("DBImpl::GetImpl:3"); + TEST_SYNC_POINT("DBImpl::GetImpl:4"); + + // Prepare to store a list of merge operations if merge occurs. + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + + Status s; + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + LookupKey lkey(key, snapshot, read_options.timestamp); + PERF_TIMER_STOP(get_snapshot_time); + + bool skip_memtable = (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, sv); + return s; + } + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); + RecordTick(stats_, MEMTABLE_MISS); + } + + { + PERF_TIMER_GUARD(get_post_process_time); + + ReturnAndCleanupSuperVersion(cfd, sv); + + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = 0; + if (s.ok()) { + if (get_impl_options.get_value) { + size = get_impl_options.value->size(); + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } + RecordTick(stats_, BYTES_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + RecordInHistogram(stats_, BYTES_PER_READ, size); + } + return s; +} + +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_MULTIGET); + PERF_TIMER_GUARD(get_snapshot_time); + + SequenceNumber consistent_seqnum; + ; + + std::unordered_map multiget_cf_data( + column_family.size()); + for (auto cf : column_family) { + auto cfh = reinterpret_cast(cf); + auto cfd = cfh->cfd(); + if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { + multiget_cf_data.emplace(cfd->GetID(), + MultiGetColumnFamilyData(cfh, nullptr)); + } + } + + std::function::iterator&)> + iter_deref_lambda = + [](std::unordered_map::iterator& + cf_iter) { return &cf_iter->second; }; + + bool unref_only = + MultiCFSnapshot>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); + + // Contain a list of merge operations if merge occurs. + MergeContext merge_context; + + // Note: this always resizes the values array + size_t num_keys = keys.size(); + std::vector stat_list(num_keys); + values->resize(num_keys); + + // Keep track of bytes that we read for statistics-recording later + uint64_t bytes_read = 0; + PERF_TIMER_STOP(get_snapshot_time); + + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. 
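+  // Caller-side sketch of this overload (illustrative, not part of this
+  // file); keys and column families below are placeholders:
+  //
+  //   std::vector<std::string> values;
+  //   std::vector<rocksdb::Status> statuses = db->MultiGet(
+  //       rocksdb::ReadOptions(),
+  //       {db->DefaultColumnFamily(), db->DefaultColumnFamily()},
+  //       {rocksdb::Slice("k1"), rocksdb::Slice("k2")}, &values);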
+ size_t num_found = 0; + for (size_t i = 0; i < num_keys; ++i) { + merge_context.Clear(); + Status& s = stat_list[i]; + std::string* value = &(*values)[i]; + + LookupKey lkey(keys[i], consistent_seqnum); + auto cfh = reinterpret_cast(column_family[i]); + SequenceNumber max_covering_tombstone_seq = 0; + auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); + assert(mgd_iter != multiget_cf_data.end()); + auto mgd = mgd_iter->second; + auto super_version = mgd.super_version; + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + bool done = false; + if (!skip_memtable) { + if (super_version->mem->Get(lkey, value, &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if (super_version->imm->Get(lkey, value, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done) { + PinnableSlice pinnable_val; + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, &pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + value->assign(pinnable_val.data(), pinnable_val.size()); + RecordTick(stats_, MEMTABLE_MISS); + } + + if (s.ok()) { + bytes_read += value->size(); + num_found++; + } + } + + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_GUARD(get_post_process_time); + autovector superversions_to_delete; + + for (auto mgd_iter : multiget_cf_data) { + auto mgd = mgd_iter.second; + if (!unref_only) { + ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); + } else { + mgd.cfd->GetSuperVersion()->Unref(); + } + } + RecordTick(stats_, NUMBER_MULTIGET_CALLS); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); + RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); + PERF_TIMER_STOP(get_post_process_time); + + return stat_list; +} + +template +bool DBImpl::MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot) { + PERF_TIMER_GUARD(get_snapshot_time); + + bool last_try = false; + if (cf_list->size() == 1) { + // Fast path for a single column family. We can simply get the thread loca + // super version + auto cf_iter = cf_list->begin(); + auto node = iter_deref_func(cf_iter); + node->super_version = GetAndRefSuperVersion(node->cfd); + if (read_options.snapshot != nullptr) { + // Note: In WritePrepared txns this is not necessary but not harmful + // either. Because prep_seq > snapshot => commit_seq > snapshot so if + // a snapshot is specified we should be fine with skipping seq numbers + // that are greater than that. + // + // In WriteUnprepared, we cannot set snapshot in the lookup key because we + // may skip uncommitted data that should be visible to the transaction for + // reading own writes. + *snapshot = + static_cast(read_options.snapshot)->number_; + if (callback) { + *snapshot = std::max(*snapshot, callback->max_visible_seq()); + } + } else { + // Since we get and reference the super version before getting + // the snapshot number, without a mutex protection, it is possible + // that a memtable switch happened in the middle and not all the + // data for this snapshot is available. 
But it will contain all + // the data available in the super version we have, which is also + // a valid snapshot to read from. + // We shouldn't get snapshot before finding and referencing the super + // version because a flush happening in between may compact away data for + // the snapshot, but the snapshot is earlier than the data overwriting it, + // so users may see wrong results. + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } + } else { + // If we end up with the same issue of memtable geting sealed during 2 + // consecutive retries, it means the write rate is very high. In that case + // its probably ok to take the mutex on the 3rd try so we can succeed for + // sure + static const int num_retries = 3; + for (int i = 0; i < num_retries; ++i) { + last_try = (i == num_retries - 1); + bool retry = false; + + if (i > 0) { + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + SuperVersion* super_version = node->super_version; + ColumnFamilyData* cfd = node->cfd; + if (super_version != nullptr) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } + node->super_version = nullptr; + } + } + if (read_options.snapshot == nullptr) { + if (last_try) { + TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); + // We're close to max number of retries. For the last retry, + // acquire the lock so we're sure to succeed + mutex_.Lock(); + } + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + *snapshot = reinterpret_cast(read_options.snapshot) + ->number_; + } + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + if (!last_try) { + node->super_version = GetAndRefSuperVersion(node->cfd); + } else { + node->super_version = node->cfd->GetSuperVersion()->Ref(); + } + TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); + if (read_options.snapshot != nullptr || last_try) { + // If user passed a snapshot, then we don't care if a memtable is + // sealed or compaction happens because the snapshot would ensure + // that older key versions are kept around. If this is the last + // retry, then we have the lock so nothing bad can happen + continue; + } + // We could get the earliest sequence number for the whole list of + // memtables, which will include immutable memtables as well, but that + // might be tricky to maintain in case we decide, in future, to do + // memtable compaction. 
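+        // If the active memtable's earliest entry is newer than the sequence
+        // number picked above, a memtable switch raced with this pass, so the
+        // referenced super versions may not line up with that sequence
+        // number; drop them and retry to regain a consistent view.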
+ if (!last_try) { + SequenceNumber seq = + node->super_version->mem->GetEarliestSequenceNumber(); + if (seq > *snapshot) { + retry = true; + break; + } + } + } + if (!retry) { + if (last_try) { + mutex_.Unlock(); + } + break; + } + } + } + + // Keep track of bytes that we read for statistics-recording later + PERF_TIMER_STOP(get_snapshot_time); + + return last_try; +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + if (num_keys == 0) { + return; + } + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + key_context.emplace_back(column_families[i], keys[i], &values[i], + &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + + autovector + multiget_cf_data; + size_t cf_start = 0; + ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + for (size_t i = 0; i < num_keys; ++i) { + KeyContext* key_ctx = sorted_keys[i]; + if (key_ctx->column_family != cf) { + multiget_cf_data.emplace_back( + MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr)); + cf_start = i; + cf = key_ctx->column_family; + } + } + { + // multiget_cf_data.emplace_back( + // MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr)); + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + } + std::function::iterator&)> + iter_deref_lambda = + [](autovector::iterator& cf_iter) { + return &(*cf_iter); + }; + + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot< + autovector>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); + + for (auto cf_iter = multiget_cf_data.begin(); + cf_iter != multiget_cf_data.end(); ++cf_iter) { + MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, + cf_iter->super_version, consistent_seqnum, nullptr, nullptr); + if (!unref_only) { + ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version); + } else { + cf_iter->cfd->GetSuperVersion()->Unref(); + } + } +} + +namespace { +// Order keys by CF ID, followed by key contents +struct CompareKeyContext { + inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) { + ColumnFamilyHandleImpl* cfh = + static_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = static_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + if (cfd_id1 < cfd_id2) { + return true; + } else if (cfd_id1 > cfd_id2) { + return false; + } + + // Both keys are from the same column family + int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + if (cmp < 0) { + return true; + } + return false; + } +}; + +} // anonymous namespace + +void DBImpl::PrepareMultiGetKeys( + size_t num_keys, bool sorted_input, + autovector* sorted_keys) { +#ifndef NDEBUG + if (sorted_input) { + for (size_t index = 0; index < sorted_keys->size(); ++index) { + if (index > 0) { + KeyContext* lhs = (*sorted_keys)[index - 1]; + KeyContext* rhs = (*sorted_keys)[index]; + ColumnFamilyHandleImpl* cfh = + reinterpret_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = reinterpret_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); 
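+        // Keys are expected to arrive grouped by ascending column family ID
+        // and, within each column family, ordered by that column family's
+        // user comparator; the asserts below spot-check adjacent pairs.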
+ + assert(cfd_id1 <= cfd_id2); + if (cfd_id1 < cfd_id2) { + continue; + } + + // Both keys are from the same column family + int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + assert(cmp <= 0); + } + index++; + } + } +#endif + if (!sorted_input) { + CompareKeyContext sort_comparator; + std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + sort_comparator); + } +} + +void DBImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input) { + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys); +} + +void DBImpl::MultiGetWithCallback( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + ReadCallback* callback, + autovector* sorted_keys) { + std::array multiget_cf_data; + multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr); + std::function::iterator&)> + iter_deref_lambda = + [](std::array::iterator& cf_iter) { + return &(*cf_iter); + }; + + size_t num_keys = sorted_keys->size(); + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot>( + read_options, callback, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); +#ifndef NDEBUG + assert(!unref_only); +#else + // Silence unused variable warning + (void)unref_only; +#endif // NDEBUG + + if (callback && read_options.snapshot == nullptr) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + callback->Refresh(consistent_seqnum); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + consistent_seqnum = callback->max_visible_seq(); + } + + MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, nullptr, + nullptr); + ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, + multiget_cf_data[0].super_version); +} + +void DBImpl::MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* super_version, SequenceNumber snapshot, + ReadCallback* callback, bool* is_blob_index) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_MULTIGET); + + // For each of the given keys, apply the entire "get" process as follows: + // First look in the memtable, then in the immutable memtable (if any). + // s is both in/out. When in, s could either be OK or MergeInProgress. + // merge_operands will contain the sequence of merges in the latter case. + size_t keys_left = num_keys; + while (keys_left) { + size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) + ? 
MultiGetContext::MAX_BATCH_SIZE + : keys_left; + MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, + batch_size, snapshot); + MultiGetRange range = ctx.GetMultiGetRange(); + bool lookup_current = false; + + keys_left -= batch_size; + for (auto mget_iter = range.begin(); mget_iter != range.end(); + ++mget_iter) { + mget_iter->merge_context.Clear(); + *mget_iter->s = Status::OK(); + } + + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + if (!skip_memtable) { + super_version->mem->MultiGet(read_options, &range, callback, + is_blob_index); + if (!range.empty()) { + super_version->imm->MultiGet(read_options, &range, callback, + is_blob_index); + } + if (!range.empty()) { + lookup_current = true; + uint64_t left = range.KeysLeft(); + RecordTick(stats_, MEMTABLE_MISS, left); + } + } + if (lookup_current) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->MultiGet(read_options, &range, callback, + is_blob_index); + } + } + + // Post processing (decrement reference counts and record statistics) + PERF_TIMER_GUARD(get_post_process_time); + size_t num_found = 0; + uint64_t bytes_read = 0; + for (size_t i = start_key; i < start_key + num_keys; ++i) { + KeyContext* key = (*sorted_keys)[i]; + if (key->s->ok()) { + bytes_read += key->value->size(); + num_found++; + } + } + + RecordTick(stats_, NUMBER_MULTIGET_CALLS); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); + RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); + PERF_TIMER_STOP(get_post_process_time); +} + +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const std::string& column_family, + ColumnFamilyHandle** handle) { + assert(handle != nullptr); + Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); + if (s.ok()) { + s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + } + return s; +} + +Status DBImpl::CreateColumnFamilies( + const ColumnFamilyOptions& cf_options, + const std::vector& column_family_names, + std::vector* handles) { + assert(handles != nullptr); + handles->clear(); + size_t num_cf = column_family_names.size(); + Status s; + bool success_once = false; + for (size_t i = 0; i < num_cf; i++) { + ColumnFamilyHandle* handle; + s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle); + if (!s.ok()) { + break; + } + handles->push_back(handle); + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) { + assert(handles != nullptr); + handles->clear(); + size_t num_cf = column_families.size(); + Status s; + bool success_once = false; + for (size_t i = 0; i < num_cf; i++) { + ColumnFamilyHandle* handle; + s = CreateColumnFamilyImpl(column_families[i].options, + column_families[i].name, &handle); + if (!s.ok()) { + break; + } + handles->push_back(handle); + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && 
!persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} + +Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) { + Status s; + Status persist_options_status; + *handle = nullptr; + + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); + if (s.ok()) { + for (auto& cf_path : cf_options.cf_paths) { + s = env_->CreateDirIfMissing(cf_path.path); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + return s; + } + + SuperVersionContext sv_context(/* create_superversion */ true); + { + InstrumentedMutexLock l(&mutex_); + + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(cf_options.comparator->Name()); + + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + { // write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, + &mutex_, directories_.GetDbDir(), false, + &cf_options); + write_thread_.ExitUnbatched(&w); + } + if (s.ok()) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + std::map> dummy_created_dirs; + s = cfd->AddDirectories(&dummy_created_dirs); + } + if (s.ok()) { + single_column_family_mode_ = false; + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, + *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + + cfd->set_initialized(); + + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } + } // InstrumentedMutexLock l(&mutex_) + + sv_context.Clean(); + // this is outside the mutex + if (s.ok()) { + NewThreadStatusCfInfo( + reinterpret_cast(*handle)->cfd()); + } + return s; +} + +Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { + assert(column_family != nullptr); + Status s = DropColumnFamilyImpl(column_family); + if (s.ok()) { + s = WriteOptionsFile(true /*need_mutex_lock*/, + true /*need_enter_write_thread*/); + } + return s; +} + +Status DBImpl::DropColumnFamilies( + const std::vector& column_families) { + Status s; + bool success_once = false; + for (auto* handle : column_families) { + s = DropColumnFamilyImpl(handle); + if (!s.ok()) { + break; + } + success_once = true; + } + if (success_once) { + Status persist_options_status = WriteOptionsFile( + true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + if (s.ok() && !persist_options_status.ok()) { + s = persist_options_status; + } + } + return s; +} 
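+
+// For illustration only (hypothetical caller code, not part of this file):
+// the create/drop entry points above are typically used together as
+//   ColumnFamilyHandle* cf = nullptr;
+//   Status s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
+//   if (s.ok()) {
+//     s = db->DropColumnFamily(cf);
+//     // Dropping only marks the column family as dropped; the handle still
+//     // has to be released separately.
+//     db->DestroyColumnFamilyHandle(cf);
+//   }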
+ +Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (cfd->GetID() == 0) { + return Status::InvalidArgument("Can't drop default column family"); + } + + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); + + VersionEdit edit; + edit.DropColumnFamily(); + edit.SetColumnFamily(cfd->GetID()); + + Status s; + { + InstrumentedMutexLock l(&mutex_); + if (cfd->IsDropped()) { + s = Status::InvalidArgument("Column family already dropped!\n"); + } + if (s.ok()) { + // we drop column family from a single write thread + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); + write_thread_.ExitUnbatched(&w); + } + if (s.ok()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + + if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. + bool new_is_snapshot_supported = true; + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) { + new_is_snapshot_supported = false; + break; + } + } + is_snapshot_supported_ = new_is_snapshot_supported; + } + bg_cv_.SignalAll(); + } + + if (s.ok()) { + // Note that here we erase the associated cf_info of the to-be-dropped + // cfd before its ref-count goes to zero to avoid having to erase cf_info + // later inside db_mutex. + EraseThreadStatusCfInfo(cfd); + assert(cfd->IsDropped()); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Dropped column family with id %u\n", cfd->GetID()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Dropping column family with id %u FAILED -- %s\n", + cfd->GetID(), s.ToString().c_str()); + } + + return s; +} + +bool DBImpl::KeyMayExist(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found) { + assert(value != nullptr); + if (value_found != nullptr) { + // falsify later if key-may-exist but can't fetch value + *value_found = true; + } + ReadOptions roptions = read_options; + roptions.read_tier = kBlockCacheTier; // read from block cache only + PinnableSlice pinnable_val; + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = value_found; + auto s = GetImpl(roptions, key, get_impl_options); + value->assign(pinnable_val.data(), pinnable_val.size()); + + // If block_cache is enabled and the index block of the table didn't + // not present in block_cache, the return value will be Status::Incomplete. + // In this case, key may still exist in the table. 
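+  // Hence Status::Incomplete is treated the same as OK here: a block-cache-
+  // only lookup that could not rule the key out still counts as "may exist".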
+ return s.ok() || s.IsIncomplete(); +} + +Iterator* DBImpl::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + Iterator* result = nullptr; + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + // if iterator wants internal keys, we can only proceed if + // we can guarantee the deletes haven't been processed yet + if (immutable_db_options_.preserve_deletes && + read_options.iter_start_seqnum > 0 && + read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) { + return NewErrorIterator(Status::InvalidArgument( + "Iterator requested internal keys which are too old and are not" + " guaranteed to be preserved, try larger iter_start_seqnum opt.")); + } + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { +#ifdef ROCKSDB_LITE + // not supported in lite version + result = nullptr; + +#else + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + auto iter = new ForwardIterator(this, read_options, cfd, sv); + result = NewDBIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, + this, cfd); +#endif + } else { + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterator is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob, + bool allow_refresh) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + + // Try to generate a DB iterator tree in continuous memory area to be + // cache friendly. Here is an example of result: + // +-------------------------------+ + // | | + // | ArenaWrappedDBIter | + // | + | + // | +---> Inner Iterator ------------+ + // | | | | + // | | +-- -- -- -- -- -- -- --+ | + // | +--- | Arena | | + // | | | | + // | Allocated Memory: | | + // | | +-------------------+ | + // | | | DBIter | <---+ + // | | + | + // | | | +-> iter_ ------------+ + // | | | | | + // | | +-------------------+ | + // | | | MergingIterator | <---+ + // | | + | + // | | | +->child iter1 ------------+ + // | | | | | | + // | | +->child iter2 ----------+ | + // | | | | | | | + // | | | +->child iter3 --------+ | | + // | | | | | | + // | | +-------------------+ | | | + // | | | Iterator1 | <--------+ + // | | +-------------------+ | | + // | | | Iterator2 | <------+ + // | | +-------------------+ | + // | | | Iterator3 | <----+ + // | | +-------------------+ + // | | | + // +-------+-----------------------+ + // + // ArenaWrappedDBIter inlines an arena area where all the iterators in + // the iterator tree are allocated in the order of being accessed when + // querying. 
+ // Laying out the iterators in the order of being accessed makes it more + // likely that any iterator pointer is close to the iterator it points to so + // that they are likely to be in the same cache line and/or page. + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, allow_blob, + read_options.snapshot != nullptr ? false : allow_refresh); + + InternalIterator* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; +} + +Status DBImpl::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + ReadCallback* read_callback = nullptr; // No read callback provided. + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { +#ifdef ROCKSDB_LITE + return Status::InvalidArgument( + "Tailing iterator not supported in RocksDB lite"); +#else + for (auto cfh : column_families) { + auto cfd = reinterpret_cast(cfh)->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + auto iter = new ForwardIterator(this, read_options, cfd, sv); + iterators->push_back(NewDBIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd)); + } +#endif + } else { + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterators is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { + auto* cfd = + reinterpret_cast(column_families[i])->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, snapshot, read_callback)); + } + } + + return Status::OK(); +} + +const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } + +#ifndef ROCKSDB_LITE +const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { + return GetSnapshotImpl(true); +} +#endif // ROCKSDB_LITE + +SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock) { + int64_t unix_time = 0; + env_->GetCurrentTime(&unix_time); // Ignore error + SnapshotImpl* s = new SnapshotImpl; + + if (lock) { + mutex_.Lock(); + } + // returns null if the underlying memtable does not support snapshot. + if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } + delete s; + return nullptr; + } + auto snapshot_seq = last_seq_same_as_publish_seq_ + ? 
versions_->LastSequence() + : versions_->LastPublishedSequence(); + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary); + if (lock) { + mutex_.Unlock(); + } + return snapshot; +} + +namespace { +typedef autovector CfdList; +bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { + for (const ColumnFamilyData* t : list) { + if (t == cfd) { + return true; + } + } + return false; +} +} // namespace + +void DBImpl::ReleaseSnapshot(const Snapshot* s) { + const SnapshotImpl* casted_s = reinterpret_cast(s); + { + InstrumentedMutexLock l(&mutex_); + snapshots_.Delete(casted_s); + uint64_t oldest_snapshot; + if (snapshots_.empty()) { + oldest_snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + oldest_snapshot = snapshots_.oldest()->number_; + } + // Avoid to go through every column family by checking a global threshold + // first. + if (oldest_snapshot > bottommost_files_mark_threshold_) { + CfdList cf_scheduled; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); + if (!cfd->current() + ->storage_info() + ->BottommostFilesMarkedForCompaction() + .empty()) { + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + cf_scheduled.push_back(cfd); + } + } + + // Calculate a new threshold, skipping those CFs where compactions are + // scheduled. We do not do the same pass as the previous loop because + // mutex might be unlocked during the loop, making the result inaccurate. + SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (CfdListContains(cf_scheduled, cfd)) { + continue; + } + new_bottommost_files_mark_threshold = std::min( + new_bottommost_files_mark_threshold, + cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; + } + } + delete casted_s; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfAllTables(props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, + const Range* range, std::size_t n, + TablePropertiesCollection* props) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + // Increment the ref count + mutex_.Lock(); + auto version = cfd->current(); + version->Ref(); + mutex_.Unlock(); + + auto s = version->GetPropertiesOfTablesInRange(range, n, props); + + // Decrement the ref count + mutex_.Lock(); + version->Unref(); + mutex_.Unlock(); + + return s; +} + +#endif // ROCKSDB_LITE + +const std::string& DBImpl::GetName() const { return dbname_; } + +Env* DBImpl::GetEnv() const { return env_; } + +FileSystem* DB::GetFileSystem() const { + static LegacyFileSystemWrapper fs_wrap(GetEnv()); + return &fs_wrap; +} + +FileSystem* DBImpl::GetFileSystem() const { + return immutable_db_options_.fs.get(); +} + +Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { + InstrumentedMutexLock l(&mutex_); + auto cfh = 
reinterpret_cast(column_family); + return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfh->cfd()->GetLatestCFOptions()); +} + +DBOptions DBImpl::GetDBOptions() const { + InstrumentedMutexLock l(&mutex_); + return BuildDBOptions(immutable_db_options_, mutable_db_options_); +} + +bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = reinterpret_cast(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_int) { + uint64_t int_value; + bool ret_value = + GetIntPropertyInternal(cfd, *property_info, false, &int_value); + if (ret_value) { + *value = ToString(int_value); + } + return ret_value; + } else if (property_info->handle_string) { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } else if (property_info->handle_string_dbimpl) { + std::string tmp_value; + bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value); + if (ret_value) { + *value = tmp_value; + } + return ret_value; + } + // Shouldn't reach here since exactly one of handle_string and handle_int + // should be non-nullptr. + assert(false); + return false; +} + +bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family, + const Slice& property, + std::map* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + value->clear(); + auto cfd = reinterpret_cast(column_family)->cfd(); + if (property_info == nullptr) { + return false; + } else if (property_info->handle_map) { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } + // If we reach this point it means that handle_map is not provided for the + // requested property + return false; +} + +bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + auto cfd = reinterpret_cast(column_family)->cfd(); + return GetIntPropertyInternal(cfd, *property_info, false, value); +} + +bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value) { + assert(property_info.handle_int != nullptr); + if (!property_info.need_out_of_mutex) { + if (is_locked) { + mutex_.AssertHeld(); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); + } + } else { + SuperVersion* sv = nullptr; + if (!is_locked) { + sv = GetAndRefSuperVersion(cfd); + } else { + sv = cfd->GetSuperVersion(); + } + + bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( + property_info, sv->current, value); + + if (!is_locked) { + ReturnAndCleanupSuperVersion(cfd, sv); + } + + return ret; + } +} + +bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { + assert(value != nullptr); + Statistics* statistics = immutable_db_options_.statistics.get(); + if (!statistics) { + return false; + } + *value = statistics->ToString(); + return true; +} + +#ifndef ROCKSDB_LITE +Status DBImpl::ResetStats() { + InstrumentedMutexLock l(&mutex_); + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if 
(cfd->initialized()) { + cfd->internal_stats()->Clear(); + } + } + return Status::OK(); +} +#endif // ROCKSDB_LITE + +bool DBImpl::GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { + return false; + } + + uint64_t sum = 0; + { + // Needs mutex to protect the list of column families. + InstrumentedMutexLock l(&mutex_); + uint64_t value; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->initialized()) { + continue; + } + if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { + sum += value; + } else { + return false; + } + } + } + *aggregated_value = sum; + return true; +} + +SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { + // TODO(ljin): consider using GetReferencedSuperVersion() directly + return cfd->GetThreadLocalSuperVersion(this); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + if (!cfd) { + return nullptr; + } + + return GetAndRefSuperVersion(cfd); +} + +void DBImpl::CleanupSuperVersion(SuperVersion* sv) { + // Release SuperVersion + if (sv->Unref()) { + bool defer_purge = + immutable_db_options().avoid_unnecessary_blocking_io; + { + InstrumentedMutexLock l(&mutex_); + sv->Cleanup(); + if (defer_purge) { + AddSuperVersionsToFreeQueue(sv); + SchedulePurge(); + } + } + if (!defer_purge) { + delete sv; + } + RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); +} + +void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, + SuperVersion* sv) { + if (!cfd->ReturnThreadLocalSuperVersion(sv)) { + CleanupSuperVersion(sv); + } +} + +// REQUIRED: this function should only be called on the write thread. +void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, + SuperVersion* sv) { + auto column_family_set = versions_->GetColumnFamilySet(); + auto cfd = column_family_set->GetColumnFamily(column_family_id); + + // If SuperVersion is held, and we successfully fetched a cfd using + // GetAndRefSuperVersion(), it must still exist. + assert(cfd != nullptr); + ReturnAndCleanupSuperVersion(cfd, sv); +} + +// REQUIRED: this function should only be called on the write thread or if the +// mutex is held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + +// REQUIRED: mutex is NOT held. 
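+// (The function acquires the mutex itself, so calling it with the mutex held
+// would self-deadlock.)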
+std::unique_ptr DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + InstrumentedMutexLock l(&mutex_); + + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id); + if (cfd == nullptr) { + return nullptr; + } + + return std::unique_ptr( + new ColumnFamilyHandleImpl(cfd, this, &mutex_)); +} + +void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) { + ColumnFamilyHandleImpl* cfh = + reinterpret_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + + // Convert user_key into a corresponding internal key. + InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); + MemTable::MemTableStats memStats = + sv->mem->ApproximateStats(k1.Encode(), k2.Encode()); + MemTable::MemTableStats immStats = + sv->imm->ApproximateStats(k1.Encode(), k2.Encode()); + *count = memStats.count + immStats.count; + *size = memStats.size + immStats.size; + + ReturnAndCleanupSuperVersion(cfd, sv); +} + +Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + if (!options.include_memtabtles && !options.include_files) { + return Status::InvalidArgument("Invalid options"); + } + + Version* v; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* sv = GetAndRefSuperVersion(cfd); + v = sv->current; + + for (int i = 0; i < n; i++) { + // Convert user_key into a corresponding internal key. + InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + sizes[i] = 0; + if (options.include_files) { + sizes[i] += versions_->ApproximateSize( + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); + } + if (options.include_memtabtles) { + sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; + sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size; + } + } + + ReturnAndCleanupSuperVersion(cfd, sv); + return Status::OK(); +} + +std::list::iterator +DBImpl::CaptureCurrentFileNumberInPendingOutputs() { + // We need to remember the iterator of our insert, because after the + // background job is done, we need to remove that element from + // pending_outputs_. 
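+  // pending_outputs_ is a std::list, so the iterator captured here stays
+  // valid while other elements are inserted or erased around it.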
+ pending_outputs_.push_back(versions_->current_next_file_number()); + auto pending_outputs_inserted_elem = pending_outputs_.end(); + --pending_outputs_inserted_elem; + return pending_outputs_inserted_elem; +} + +void DBImpl::ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v) { + if (v.get() != nullptr) { + pending_outputs_.erase(*v.get()); + v.reset(); + } +} + +#ifndef ROCKSDB_LITE +Status DBImpl::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options) { + RecordTick(stats_, GET_UPDATES_SINCE_CALLS); + if (seq > versions_->LastSequence()) { + return Status::NotFound("Requested sequence not yet written in the db"); + } + return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); +} + +Status DBImpl::DeleteFile(std::string name) { + uint64_t number; + FileType type; + WalFileType log_type; + if (!ParseFileName(name, &number, &type, &log_type) || + (type != kTableFile && type != kLogFile)) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", + name.c_str()); + return Status::InvalidArgument("Invalid file name"); + } + + Status status; + if (type == kLogFile) { + // Only allow deleting archived log files + if (log_type != kArchivedLogFile) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed - not archived log.\n", + name.c_str()); + return Status::NotSupported("Delete only supported for archived logs"); + } + status = wal_manager_.DeleteFile(name, number); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DeleteFile %s failed -- %s.\n", name.c_str(), + status.ToString().c_str()); + } + return status; + } + + int level; + FileMetaData* metadata; + ColumnFamilyData* cfd; + VersionEdit edit; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); + if (!status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed. File not found\n", name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not found"); + } + assert(level < cfd->NumberLevels()); + + // If the file is being compacted no need to delete. + if (metadata->being_compacted) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DeleteFile %s Skipped. File about to be compacted\n", + name.c_str()); + job_context.Clean(); + return Status::OK(); + } + + // Only the files in the last level can be deleted externally. + // This is to make sure that any deletion tombstones are not + // lost. Check that the level passed is the last level. + auto* vstoreage = cfd->current()->storage_info(); + for (int i = level + 1; i < cfd->NumberLevels(); i++) { + if (vstoreage->NumLevelFiles(i) != 0) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s FAILED. 
File not in last level\n", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File not in last level"); + } + } + // if level == 0, it has to be the oldest file + if (level == 0 && + vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "DeleteFile %s failed ---" + " target file in level 0 must be the oldest.", + name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(level, number); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + FindObsoleteFiles(&job_context, false); + } // lock released here + + LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end) { + Status status; + auto cfh = reinterpret_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + VersionEdit edit; + std::set deleted_files; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + Version* input_version = cfd->current(); + + auto* vstorage = input_version->storage_info(); + for (size_t r = 0; r < n; r++) { + auto begin = ranges[r].start, end = ranges[r].limit; + for (int i = 1; i < cfd->NumberLevels(); i++) { + if (vstorage->LevelFiles(i).empty() || + !vstorage->OverlapInLevel(i, begin, end)) { + continue; + } + std::vector level_files; + InternalKey begin_storage, end_storage, *begin_key, *end_key; + if (begin == nullptr) { + begin_key = nullptr; + } else { + begin_storage.SetMinPossibleForUserKey(*begin); + begin_key = &begin_storage; + } + if (end == nullptr) { + end_key = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + end_key = &end_storage; + } + + vstorage->GetCleanInputsWithinInterval( + i, begin_key, end_key, &level_files, -1 /* hint_index */, + nullptr /* file_index */); + FileMetaData* level_file; + for (uint32_t j = 0; j < level_files.size(); j++) { + level_file = level_files[j]; + if (level_file->being_compacted) { + continue; + } + if (deleted_files.find(level_file) != deleted_files.end()) { + continue; + } + if (!include_end && end != nullptr && + cfd->user_comparator()->Compare(level_file->largest.user_key(), + *end) == 0) { + continue; + } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.insert(level_file); + level_file->being_compacted = true; + } + } + } + if (edit.GetDeletedFiles().empty()) { + job_context.Clean(); + return Status::OK(); + } + input_version->Ref(); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + for (auto* deleted_file : deleted_files) { + deleted_file->being_compacted = false; + } + input_version->Unref(); + FindObsoleteFiles(&job_context, false); + } // lock released here + + 
LogFlush(immutable_db_options_.info_log); + // remove files outside the db-lock + if (job_context.HaveSomethingToDelete()) { + // Call PurgeObsoleteFiles() without holding mutex. + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + return status; +} + +void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + versions_->GetLiveFilesMetaData(metadata); +} + +void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { + assert(column_family); + auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* sv = GetAndRefSuperVersion(cfd); + { + // Without mutex, Version::GetColumnFamilyMetaData will have data race with + // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but + // this may cause regression. An alternative is to make + // FileMetaData::being_compacted atomic, but it will make FileMetaData + // non-copy-able. Another option is to separate these variables from + // original FileMetaData struct, and this requires re-organization of data + // structures. For now, we take the easy approach. If + // DB::GetColumnFamilyMetaData is not called frequently, the regression + // should not be big. We still need to keep an eye on it. + InstrumentedMutexLock l(&mutex_); + sv->current->GetColumnFamilyMetaData(cf_meta); + } + ReturnAndCleanupSuperVersion(cfd, sv); +} + +#endif // ROCKSDB_LITE + +Status DBImpl::CheckConsistency() { + mutex_.AssertHeld(); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); + + std::string corruption_messages; + + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + // Instead of calling GetFileSize() for each expected file, call + // GetChildren() for the DB directory and check that all expected files + // are listed, without checking their sizes. + // Since sst files might be in different directories, do it for each + // directory separately. + std::map> files_by_directory; + for (const auto& md : metadata) { + // md.name has a leading "/". Remove it. + std::string fname = md.name; + if (!fname.empty() && fname[0] == '/') { + fname = fname.substr(1); + } + files_by_directory[md.db_path].push_back(fname); + } + for (const auto& dir_files : files_by_directory) { + std::string directory = dir_files.first; + std::vector existing_files; + Status s = env_->GetChildren(directory, &existing_files); + if (!s.ok()) { + corruption_messages += + "Can't list files in " + directory + ": " + s.ToString() + "\n"; + continue; + } + std::sort(existing_files.begin(), existing_files.end()); + + for (const std::string& fname : dir_files.second) { + if (!std::binary_search(existing_files.begin(), existing_files.end(), + fname) && + !std::binary_search(existing_files.begin(), existing_files.end(), + Rocks2LevelTableFileName(fname))) { + corruption_messages += + "Missing sst file " + fname + " in " + directory + "\n"; + } + } + } + } else { + for (const auto& md : metadata) { + // md.name has a leading "/". 
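+      // Concatenating db_path and name therefore yields a complete file path
+      // without adding an extra separator.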
+ std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); + Status s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } else if (fsize != md.size) { + corruption_messages += "Sst file size mismatch: " + file_path + + ". Size recorded in manifest " + + ToString(md.size) + ", actual size " + + ToString(fsize) + "\n"; + } + } + } + + if (corruption_messages.size() == 0) { + return Status::OK(); + } else { + return Status::Corruption(corruption_messages); + } +} + +Status DBImpl::GetDbIdentity(std::string& identity) const { + identity.assign(db_id_); + return Status::OK(); +} + +Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { + std::string idfilename = IdentityFileName(dbname_); + const FileOptions soptions; + + Status s = ReadFileToString(fs_.get(), idfilename, identity); + if (!s.ok()) { + return s; + } + + // If last character is '\n' remove it from identity + if (identity->size() > 0 && identity->back() == '\n') { + identity->pop_back(); + } + return s; +} + +// Default implementation -- returns not supported status +Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family_name*/, + ColumnFamilyHandle** /*handle*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::CreateColumnFamilies( + const std::vector& /*column_families*/, + std::vector* /*handles*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) { + return Status::NotSupported(""); +} + +Status DB::DropColumnFamilies( + const std::vector& /*column_families*/) { + return Status::NotSupported(""); +} + +Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + delete column_family; + return Status::OK(); +} + +DB::~DB() {} + +Status DBImpl::Close() { + if (!closed_) { + { + InstrumentedMutexLock l(&mutex_); + // If there is unreleased snapshot, fail the close call + if (!snapshots_.empty()) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); + } + } + + closed_ = true; + return CloseImpl(); + } + return Status::OK(); +} + +Status DB::ListColumnFamilies(const DBOptions& db_options, + const std::string& name, + std::vector* column_families) { + FileSystem* fs = db_options.file_system.get(); + LegacyFileSystemWrapper legacy_fs(db_options.env); + if (!fs) { + fs = &legacy_fs; + } + return VersionSet::ListColumnFamilies(column_families, name, fs); +} + +Snapshot::~Snapshot() {} + +Status DestroyDB(const std::string& dbname, const Options& options, + const std::vector& column_families) { + ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + std::vector filenames; + bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); + + // Reset the logger because it holds a handle to the + // log file and prevents cleanup and directory removal + soptions.info_log.reset(); + // Ignore error in case directory does not exist + env->GetChildren(dbname, &filenames); + + FileLock* lock; + const std::string lockname = LockFileName(dbname); + Status result = 
env->LockFile(lockname, &lock); + if (result.ok()) { + uint64_t number; + FileType type; + InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname); + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) && + type != kDBLockFile) { // Lock file will be deleted at end + Status del; + std::string path_to_delete = dbname + "/" + fname; + if (type == kMetaDatabase) { + del = DestroyDB(path_to_delete, options); + } else if (type == kTableFile || type == kLogFile) { + del = DeleteDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + } else { + del = env->DeleteFile(path_to_delete); + } + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + std::vector paths; + + for (const auto& path : options.db_paths) { + paths.emplace_back(path.path); + } + for (const auto& cf : column_families) { + for (const auto& path : cf.options.cf_paths) { + paths.emplace_back(path.path); + } + } + + // Remove duplicate paths. + // Note that we compare only the actual paths but not path ids. + // This reason is that same path can appear at different path_ids + // for different column families. + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + + for (const auto& path : paths) { + if (env->GetChildren(path, &filenames).ok()) { + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, &type) && + type == kTableFile) { // Lock file will be deleted at end + std::string table_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, table_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->DeleteDir(path); + } + } + + std::vector walDirFiles; + std::string archivedir = ArchivalDirectory(dbname); + bool wal_dir_exists = false; + if (dbname != soptions.wal_dir) { + wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + // Archive dir may be inside wal dir or dbname and should be + // processed and removed before those otherwise we have issues + // removing them + std::vector archiveFiles; + if (env->GetChildren(archivedir, &archiveFiles).ok()) { + // Delete archival files. + for (const auto& file : archiveFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->DeleteDir(archivedir); + } + + // Delete log files in the WAL dir + if (wal_dir_exists) { + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = + DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + env->DeleteDir(soptions.wal_dir); + } + + env->UnlockFile(lock); // Ignore error since state is already gone + env->DeleteFile(lockname); + + // sst_file_manager holds a ref to the logger. Make sure the logger is + // gone before trying to remove the directory. 
+ soptions.sst_file_manager.reset(); + + env->DeleteDir(dbname); // Ignore error in case dir contains other files + } + return result; +} + +Status DBImpl::WriteOptionsFile(bool need_mutex_lock, + bool need_enter_write_thread) { +#ifndef ROCKSDB_LITE + WriteThread::Writer w; + if (need_mutex_lock) { + mutex_.Lock(); + } else { + mutex_.AssertHeld(); + } + if (need_enter_write_thread) { + write_thread_.EnterUnbatched(&w, &mutex_); + } + + std::vector cf_names; + std::vector cf_opts; + + // This part requires mutex to protect the column family options + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cf_names.push_back(cfd->GetName()); + cf_opts.push_back(cfd->GetLatestCFOptions()); + } + + // Unlock during expensive operations. New writes cannot get here + // because the single write thread ensures all new writes get queued. + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + mutex_.Unlock(); + + TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1"); + TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2"); + + std::string file_name = + TempOptionsFileName(GetName(), versions_->NewFileNumber()); + Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name, + GetFileSystem()); + + if (s.ok()) { + s = RenameTempFileToOptionsFile(file_name); + } + // restore lock + if (!need_mutex_lock) { + mutex_.Lock(); + } + if (need_enter_write_thread) { + write_thread_.ExitUnbatched(&w); + } + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unnable to persist options -- %s", s.ToString().c_str()); + if (immutable_db_options_.fail_if_options_file_error) { + return Status::IOError("Unable to persist options.", + s.ToString().c_str()); + } + } +#else + (void)need_mutex_lock; + (void)need_enter_write_thread; +#endif // !ROCKSDB_LITE + return Status::OK(); +} + +#ifndef ROCKSDB_LITE +namespace { +void DeleteOptionsFilesHelper(const std::map& filenames, + const size_t num_files_to_keep, + const std::shared_ptr& info_log, + Env* env) { + if (filenames.size() <= num_files_to_keep) { + return; + } + for (auto iter = std::next(filenames.begin(), num_files_to_keep); + iter != filenames.end(); ++iter) { + if (!env->DeleteFile(iter->second).ok()) { + ROCKS_LOG_WARN(info_log, "Unable to delete options file %s", + iter->second.c_str()); + } + } +} +} // namespace +#endif // !ROCKSDB_LITE + +Status DBImpl::DeleteObsoleteOptionsFiles() { +#ifndef ROCKSDB_LITE + std::vector filenames; + // use ordered map to store keep the filenames sorted from the newest + // to the oldest. 
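As a standalone illustration of the ordering trick used in the map declared just below: keying the map by std::numeric_limits<uint64_t>::max() - file_number makes iteration run from the newest OPTIONS file to the oldest, so DeleteOptionsFilesHelper() above only has to keep the first num_files_to_keep entries. The file numbers in this sketch are made up:

#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <string>

int main() {
  // Hypothetical OPTIONS file numbers found in the DB directory.
  const uint64_t file_numbers[] = {12, 97, 54};
  std::map<uint64_t, std::string> by_newest_first;
  for (uint64_t n : file_numbers) {
    // Larger file numbers are newer; inverting the key sorts them first.
    by_newest_first.emplace(std::numeric_limits<uint64_t>::max() - n,
                            "OPTIONS-" + std::to_string(n));
  }
  const size_t num_files_to_keep = 2;
  size_t idx = 0;
  for (const auto& entry : by_newest_first) {
    std::cout << entry.second
              << (idx++ < num_files_to_keep ? "  (keep)\n" : "  (delete)\n");
  }
  // Prints OPTIONS-97 (keep), OPTIONS-54 (keep), OPTIONS-12 (delete).
  return 0;
}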
+ std::map options_filenames; + Status s; + s = GetEnv()->GetChildren(GetName(), &filenames); + if (!s.ok()) { + return s; + } + for (auto& filename : filenames) { + uint64_t file_number; + FileType type; + if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) { + options_filenames.insert( + {std::numeric_limits::max() - file_number, + GetName() + "/" + filename}); + } + } + + // Keeps the latest 2 Options file + const size_t kNumOptionsFilesKept = 2; + DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept, + immutable_db_options_.info_log, GetEnv()); + return Status::OK(); +#else + return Status::OK(); +#endif // !ROCKSDB_LITE +} + +Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { +#ifndef ROCKSDB_LITE + Status s; + + uint64_t options_file_number = versions_->NewFileNumber(); + std::string options_file_name = + OptionsFileName(GetName(), options_file_number); + // Retry if the file name happen to conflict with an existing one. + s = GetEnv()->RenameFile(file_name, options_file_name); + if (s.ok()) { + InstrumentedMutexLock l(&mutex_); + versions_->options_file_number_ = options_file_number; + } + + if (0 == disable_delete_obsolete_files_) { + DeleteObsoleteOptionsFiles(); + } + return s; +#else + (void)file_name; + return Status::OK(); +#endif // !ROCKSDB_LITE +} + +#ifdef ROCKSDB_USING_THREAD_STATUS + +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), + cfd->ioptions()->env); + } +} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseColumnFamilyInfo(cfd); + } +} + +void DBImpl::EraseThreadStatusDbInfo() const { + if (immutable_db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseDatabaseInfo(this); + } +} + +#else +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} + +void DBImpl::EraseThreadStatusDbInfo() const {} +#endif // ROCKSDB_USING_THREAD_STATUS + +// +// A global method that can dump out the build version +void DumpRocksDBBuildVersion(Logger* log) { +#if !defined(IOS_CROSS_COMPILE) + // if we compile with Xcode, we don't run build_detect_version, so we don't + // generate util/build_version.cc + ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, + ROCKSDB_MINOR, ROCKSDB_PATCH); + ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); + ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date); +#else + (void)log; // ignore "-Wunused-parameter" +#endif +} + +#ifndef ROCKSDB_LITE +SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, + bool include_history) { + // Find the earliest sequence number that we know we can rely on reading + // from the memtable without needing to check sst files. 
+ SequenceNumber earliest_seq = + sv->imm->GetEarliestSequenceNumber(include_history); + if (earliest_seq == kMaxSequenceNumber) { + earliest_seq = sv->mem->GetEarliestSequenceNumber(); + } + assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq); + + return earliest_seq; +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, + bool* found_record_for_key, + bool* is_blob_index) { + Status s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + + ReadOptions read_options; + SequenceNumber current_seq = versions_->LastSequence(); + LookupKey lkey(key, current_seq); + + *seq = kMaxSequenceNumber; + *found_record_for_key = false; + + // Check if there is a record for this key in the latest memtable + sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, + seq, read_options, nullptr /*read_callback*/, is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from MemTable::Get: %s\n", + s.ToString().c_str()); + + return s; + } + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check immutable memtables + *found_record_for_key = true; + return Status::OK(); + } + + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + + // Check if there is a record for this key in the immutable memtables + sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, + seq, read_options, nullptr /*read_callback*/, is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from MemTableList::Get: %s\n", + s.ToString().c_str()); + + return s; + } + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check memtable history + *found_record_for_key = true; + return Status::OK(); + } + + SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber(); + if (lower_bound_in_imm != kMaxSequenceNumber && + lower_bound_in_imm < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + + // Check if there is a record for this key in the immutable memtables + sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading memtable. + ROCKS_LOG_ERROR( + immutable_db_options_.info_log, + "Unexpected status returned from MemTableList::GetFromHistory: %s\n", + s.ToString().c_str()); + + return s; + } + + if (*seq != kMaxSequenceNumber) { + // Found a sequence number, no need to check SST files + *found_record_for_key = true; + return Status::OK(); + } + + // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true) + // check here to skip the history if possible. But currently the caller + // already does that. Maybe we should move the logic here later. + + // TODO(agiardullo): possible optimization: consider checking cached + // SST files if cache_only=true? 
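The lookup cascade above (active memtable, immutable memtables, memtable history, then the SST check that follows) exists mainly to serve write-conflict validation, which compares the newest sequence number recorded for a key against a snapshot's sequence number. A hedged sketch of such a caller; the helper name MayConflict and its policy are illustrative and not part of this code, and it assumes the caller already holds a referenced SuperVersion:

#include "db/db_impl/db_impl.h"

// Illustrative only: returns true if `key` may have been written after
// `snapshot_seq`, i.e. a write based on that snapshot could conflict.
bool MayConflict(ROCKSDB_NAMESPACE::DBImpl* db,
                 ROCKSDB_NAMESPACE::SuperVersion* sv,
                 const ROCKSDB_NAMESPACE::Slice& key,
                 ROCKSDB_NAMESPACE::SequenceNumber snapshot_seq) {
  using namespace ROCKSDB_NAMESPACE;
  SequenceNumber seq = kMaxSequenceNumber;
  bool found_record_for_key = false;
  // cache_only=true restricts the check to memtables (cheap, may miss older
  // records); lower_bound_seq lets the lookup stop early per the contract
  // documented in the header.
  Status s = db->GetLatestSequenceForKey(sv, key, /*cache_only=*/true,
                                         /*lower_bound_seq=*/snapshot_seq,
                                         &seq, &found_record_for_key);
  if (!s.ok() && !s.IsNotFound()) {
    return true;  // be conservative on unexpected errors
  }
  // A record newer than the snapshot means a potential conflict.
  return found_record_for_key && seq > snapshot_seq;
}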
+ if (!cache_only) { + // Check tables + sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, nullptr /* value_found */, + found_record_for_key, seq, nullptr /*read_callback*/, + is_blob_index); + + if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { + // unexpected error reading SST files + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Unexpected status returned from Version::Get: %s\n", + s.ToString().c_str()); + } + } + + return s; +} + +Status DBImpl::IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) { + IngestExternalFileArg arg; + arg.column_family = column_family; + arg.external_files = external_files; + arg.options = ingestion_options; + return IngestExternalFiles({arg}); +} + +Status DBImpl::IngestExternalFiles( + const std::vector& args) { + if (args.empty()) { + return Status::InvalidArgument("ingestion arg list is empty"); + } + { + std::unordered_set unique_cfhs; + for (const auto& arg : args) { + if (arg.column_family == nullptr) { + return Status::InvalidArgument("column family handle is null"); + } else if (unique_cfhs.count(arg.column_family) > 0) { + return Status::InvalidArgument( + "ingestion args have duplicate column families"); + } + unique_cfhs.insert(arg.column_family); + } + } + // Ingest multiple external SST files atomically. + size_t num_cfs = args.size(); + for (size_t i = 0; i != num_cfs; ++i) { + if (args[i].external_files.empty()) { + char err_msg[128] = {0}; + snprintf(err_msg, 128, "external_files[%zu] is empty", i); + return Status::InvalidArgument(err_msg); + } + } + for (const auto& arg : args) { + const IngestExternalFileOptions& ingest_opts = arg.options; + if (ingest_opts.ingest_behind && + !immutable_db_options_.allow_ingest_behind) { + return Status::InvalidArgument( + "can't ingest_behind file in DB with allow_ingest_behind=false"); + } + } + + // TODO (yanqin) maybe handle the case in which column_families have + // duplicates + std::unique_ptr::iterator> pending_output_elem; + size_t total = 0; + for (const auto& arg : args) { + total += arg.external_files.size(); + } + uint64_t next_file_number = 0; + Status status = ReserveFileNumbersBeforeIngestion( + static_cast(args[0].column_family)->cfd(), total, + pending_output_elem, &next_file_number); + if (!status.ok()) { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; + } + + std::vector ingestion_jobs; + for (const auto& arg : args) { + auto* cfd = static_cast(arg.column_family)->cfd(); + ingestion_jobs.emplace_back( + env_, versions_.get(), cfd, immutable_db_options_, file_options_, + &snapshots_, arg.options, &directories_, &event_logger_); + } + std::vector> exec_results; + for (size_t i = 0; i != num_cfs; ++i) { + exec_results.emplace_back(false, Status::OK()); + } + // TODO(yanqin) maybe make jobs run in parallel + uint64_t start_file_number = next_file_number; + for (size_t i = 1; i != num_cfs; ++i) { + start_file_number += args[i - 1].external_files.size(); + auto* cfd = + static_cast(args[i].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + exec_results[i].second = ingestion_jobs[i].Prepare( + args[i].external_files, start_file_number, super_version); + exec_results[i].first = true; + CleanupSuperVersion(super_version); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); + 
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"); + { + auto* cfd = + static_cast(args[0].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + exec_results[0].second = ingestion_jobs[0].Prepare( + args[0].external_files, next_file_number, super_version); + exec_results[0].first = true; + CleanupSuperVersion(super_version); + } + for (const auto& exec_result : exec_results) { + if (!exec_result.second.ok()) { + status = exec_result.second; + break; + } + } + if (!status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + if (exec_results[i].first) { + ingestion_jobs[i].Cleanup(status); + } + } + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; + } + + std::vector sv_ctxs; + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs.emplace_back(true /* create_superversion */); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0"); + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1"); + TEST_SYNC_POINT("DBImpl::AddFile:Start"); + { + InstrumentedMutexLock l(&mutex_); + TEST_SYNC_POINT("DBImpl::AddFile:MutexLock"); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + // When unordered_write is enabled, the keys are writing to memtable in an + // unordered way. If the ingestion job checks memtable key range before the + // key landing in memtable, the ingestion job may skip the necessary + // memtable flush. + // So wait here to ensure there is no pending write to memtable. + WaitForPendingWrites(); + + num_running_ingest_file_ += static_cast(num_cfs); + TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter"); + + bool at_least_one_cf_need_flush = false; + std::vector need_flush(num_cfs, false); + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + // TODO (yanqin) investigate whether we should abort ingestion or + // proceed with other non-dropped column families. + status = Status::InvalidArgument( + "cannot ingest an external file into a dropped CF"); + break; + } + bool tmp = false; + status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion()); + need_flush[i] = tmp; + at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp); + if (!status.ok()) { + break; + } + } + TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush", + &at_least_one_cf_need_flush); + + if (status.ok() && at_least_one_cf_need_flush) { + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + autovector cfds_to_flush; + SelectColumnFamiliesForAtomicFlush(&cfds_to_flush); + mutex_.Unlock(); + status = AtomicFlushMemTables(cfds_to_flush, flush_opts, + FlushReason::kExternalFileIngestion, + true /* writes_stopped */); + mutex_.Lock(); + } else { + for (size_t i = 0; i != num_cfs; ++i) { + if (need_flush[i]) { + mutex_.Unlock(); + auto* cfd = + static_cast(args[i].column_family) + ->cfd(); + status = FlushMemTable(cfd, flush_opts, + FlushReason::kExternalFileIngestion, + true /* writes_stopped */); + mutex_.Lock(); + if (!status.ok()) { + break; + } + } + } + } + } + // Run ingestion jobs. 
+ if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + status = ingestion_jobs[i].Run(); + if (!status.ok()) { + break; + } + } + } + if (status.ok()) { + int consumed_seqno_count = + ingestion_jobs[0].ConsumedSequenceNumbersCount(); +#ifndef NDEBUG + for (size_t i = 1; i != num_cfs; ++i) { + assert(!!consumed_seqno_count == + !!ingestion_jobs[i].ConsumedSequenceNumbersCount()); + consumed_seqno_count += + ingestion_jobs[i].ConsumedSequenceNumbersCount(); + } +#endif + if (consumed_seqno_count > 0) { + const SequenceNumber last_seqno = versions_->LastSequence(); + versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastSequence(last_seqno + consumed_seqno_count); + } + autovector cfds_to_commit; + autovector mutable_cf_options_list; + autovector> edit_lists; + uint32_t num_entries = 0; + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + continue; + } + cfds_to_commit.push_back(cfd); + mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); + autovector edit_list; + edit_list.push_back(ingestion_jobs[i].edit()); + edit_lists.push_back(edit_list); + ++num_entries; + } + // Mark the version edits as an atomic group if the number of version + // edits exceeds 1. + if (cfds_to_commit.size() > 1) { + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + } + status = + versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + edit_lists, &mutex_, directories_.GetDbDir()); + } + + if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i], + *cfd->GetLatestMutableCFOptions()); +#ifndef NDEBUG + if (0 == i && num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0"); + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"); + } +#endif // !NDEBUG + } + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + if (status.ok()) { + for (auto& job : ingestion_jobs) { + job.UpdateStats(); + } + } + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + num_running_ingest_file_ -= static_cast(num_cfs); + if (0 == num_running_ingest_file_) { + bg_cv_.SignalAll(); + } + TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock"); + } + // mutex_ is unlocked here + + // Cleanup + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs[i].Clean(); + // This may rollback jobs that have completed successfully. This is + // intended for atomicity. 
+ ingestion_jobs[i].Cleanup(status); + } + if (status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]); + } + } + } + return status; +} + +Status DBImpl::CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + assert(handle != nullptr); + assert(*handle == nullptr); + std::string cf_comparator_name = options.comparator->Name(); + if (cf_comparator_name != metadata.db_comparator_name) { + return Status::InvalidArgument("Comparator name mismatch"); + } + + // Create column family. + auto status = CreateColumnFamily(options, column_family_name, handle); + if (!status.ok()) { + return status; + } + + // Import sst files from metadata. + auto cfh = reinterpret_cast(*handle); + auto cfd = cfh->cfd(); + ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, + immutable_db_options_, file_options_, + import_options, metadata.files); + + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; + std::unique_ptr::iterator> pending_output_elem; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Don't import files when there is a bg_error + status = error_handler_.GetBGError(); + } + + // Make sure that bg cleanup wont delete the files that we are importing + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + if (status.ok()) { + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. 
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + } + dummy_sv_ctx.Clean(); + + if (status.ok()) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + status = import_job.Prepare(next_file_number, sv); + CleanupSuperVersion(sv); + } + + if (status.ok()) { + SuperVersionContext sv_context(true /*create_superversion*/); + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + num_running_ingest_file_++; + assert(!cfd->IsDropped()); + status = import_job.Run(); + + // Install job edit [Mutex will be unlocked here] + if (status.ok()) { + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + num_running_ingest_file_--; + if (num_running_ingest_file_ == 0) { + bg_cv_.SignalAll(); + } + } + // mutex_ is unlocked here + + sv_context.Clean(); + } + + { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + } + + import_job.Cleanup(status); + if (!status.ok()) { + DropColumnFamily(*handle); + DestroyColumnFamilyHandle(*handle); + *handle = nullptr; + } + return status; +} + +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { + Status s; + std::vector cfd_list; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + cfd_list.push_back(cfd); + } + } + } + std::vector sv_list; + for (auto cfd : cfd_list) { + sv_list.push_back(cfd->GetReferencedSuperVersion(this)); + } + for (auto& sv : sv_list) { + VersionStorageInfo* vstorage = sv->current->storage_info(); + ColumnFamilyData* cfd = sv->current->cfd(); + Options opts; + { + InstrumentedMutexLock l(&mutex_); + opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfd->GetLatestCFOptions()); + } + for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { + for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); + j++) { + const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + std::string fname = TableFileName(cfd->ioptions()->cf_paths, + fd.GetNumber(), fd.GetPathId()); + s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, + read_options, fname); + } + } + if (!s.ok()) { + break; + } + } + bool defer_purge = + immutable_db_options().avoid_unnecessary_blocking_io; + { + InstrumentedMutexLock l(&mutex_); + for (auto sv : sv_list) { + if (sv && sv->Unref()) { + sv->Cleanup(); + if (defer_purge) { + AddSuperVersionsToFreeQueue(sv); + } else { + delete sv; + } + } + } + if (defer_purge) { + SchedulePurge(); + } + for (auto cfd : cfd_list) { + cfd->UnrefAndTryDelete(); + } + } + return s; +} + +void DBImpl::NotifyOnExternalFileIngested( + 
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { + if (immutable_db_options_.listeners.empty()) { + return; + } + + for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) { + ExternalFileIngestionInfo info; + info.cf_name = cfd->GetName(); + info.external_file_path = f.external_file_path; + info.internal_file_path = f.internal_file_path; + info.global_seqno = f.assigned_seqno; + info.table_properties = f.table_properties; + for (auto listener : immutable_db_options_.listeners) { + listener->OnExternalFileIngested(this, info); + } + } +} + +void DBImpl::WaitForIngestFile() { + mutex_.AssertHeld(); + while (num_running_ingest_file_ > 0) { + bg_cv_.Wait(); + } +} + +Status DBImpl::StartTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock(&trace_mutex_); + tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + return Status::OK(); +} + +Status DBImpl::EndTrace() { + InstrumentedMutexLock lock(&trace_mutex_); + Status s; + if (tracer_ != nullptr) { + s = tracer_->Close(); + tracer_.reset(); + } else { + return Status::IOError("No trace file to close"); + } + return s; +} + +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(env_, trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeek(cf_id, key); + } + } + return s; +} + +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, + const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeekForPrev(cf_id, key); + } + } + return s; +} + +Status DBImpl::ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number) { + Status s; + SuperVersionContext dummy_sv_ctx(true /* create_superversion */); + assert(nullptr != next_file_number); + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Do not ingest files when there is a bg_error + return error_handler_.GetBGError(); + } + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + *next_file_number = versions_->FetchAddFileNumber(static_cast(num)); + auto cf_options = cfd->GetLatestMutableCFOptions(); + VersionEdit dummy_edit; + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. 
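Stepping away from ReserveFileNumbersBeforeIngestion() for a moment: the tracing entry points defined a little earlier (StartTrace(), EndTrace(), and the TraceIteratorSeek* hooks) are typically driven as in the sketch below. This assumes NewFileTraceWriter() is available from rocksdb/trace_reader_writer.h, which this file includes; the trace path and helper name are made up:

#include <memory>
#include <utility>

#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

ROCKSDB_NAMESPACE::Status TraceWorkload(ROCKSDB_NAMESPACE::DB* db,
                                        ROCKSDB_NAMESPACE::Env* env) {
  using namespace ROCKSDB_NAMESPACE;
  std::unique_ptr<TraceWriter> trace_writer;
  // Assumed factory from rocksdb/trace_reader_writer.h.
  Status s = NewFileTraceWriter(env, EnvOptions(), "/tmp/rocksdb_trace",
                                &trace_writer);
  if (!s.ok()) return s;
  s = db->StartTrace(TraceOptions(), std::move(trace_writer));
  if (!s.ok()) return s;
  // ... run the workload to be captured: reads, writes and iterator seeks
  // are recorded via the tracer_ hooks shown above ...
  return db->EndTrace();
}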
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (s.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + dummy_sv_ctx.Clean(); + return s; +} + +Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + if (mutable_db_options_.max_open_files == -1) { + uint64_t oldest_time = port::kMaxUint64; + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped()) { + uint64_t ctime; + { + SuperVersion* sv = GetAndRefSuperVersion(cfd); + Version* version = sv->current; + version->GetCreationTimeOfOldestFile(&ctime); + ReturnAndCleanupSuperVersion(cfd, sv); + } + + if (ctime < oldest_time) { + oldest_time = ctime; + } + if (oldest_time == 0) { + break; + } + } + } + *creation_time = oldest_time; + return Status::OK(); + } else { + return Status::NotSupported("This API only works if max_open_files = -1"); + } +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h new file mode 100644 index 000000000..119555cb4 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl.h @@ -0,0 +1,2107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/compaction/compaction_job.h" +#include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/flush_job.h" +#include "db/flush_scheduler.h" +#include "db/import_column_family_job.h" +#include "db/internal_stats.h" +#include "db/log_writer.h" +#include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" +#include "db/pre_release_callback.h" +#include "db/range_del_aggregator.h" +#include "db/read_callback.h" +#include "db/snapshot_checker.h" +#include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" +#include "db/version_edit.h" +#include "db/wal_manager.h" +#include "db/write_controller.h" +#include "db/write_thread.h" +#include "logging/event_logger.h" +#include "monitoring/instrumented_mutex.h" +#include "options/db_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/write_buffer_manager.h" +#include "table/scoped_arena_iterator.h" +#include "trace_replay/block_cache_tracer.h" +#include "trace_replay/trace_replay.h" +#include "util/autovector.h" +#include "util/hash.h" +#include "util/repeatable_thread.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" + +namespace ROCKSDB_NAMESPACE { + +class Arena; +class ArenaWrappedDBIter; +class InMemoryStatsHistoryIterator; +class MemTable; +class PersistentStatsHistoryIterator; +class TableCache; +class TaskLimiterToken; +class Version; +class VersionEdit; +class VersionSet; +class WriteCallback; +struct JobContext; +struct 
ExternalSstFileInfo; +struct MemTableInfo; + +// Class to maintain directories for all database paths other than main one. +class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id) const { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; + } + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; +}; + +// While DB is the public interface of RocksDB, and DBImpl is the actual +// class implementing it. It's the entrance of the core RocksdB engine. +// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a +// DBImpl internally. +// Other than functions implementing the DB interface, some public +// functions are there for other internal components to call. For +// example, TransactionDB directly calls DBImpl::WriteImpl() and +// BlobDB directly calls DBImpl::GetImpl(). Some other functions +// are for sub-components to call. For example, ColumnFamilyHandleImpl +// calls DBImpl::FindObsoleteFiles(). +// +// Since it's a very large class, the definition of the functions is +// divided in several db_impl_*.cc files, besides db_impl.cc. +class DBImpl : public DB { + public: + DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch = false, const bool batch_per_txn = true); + // No copying allowed + DBImpl(const DBImpl&) = delete; + void operator=(const DBImpl&) = delete; + + virtual ~DBImpl(); + + // ---- Implementations of the DB interface ---- + + using DB::Resume; + virtual Status Resume() override; + + using DB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + using DB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + using DB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override; + using DB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override; + using DB::Write; + virtual Status Write(const WriteOptions& options, + WriteBatch* updates) override; + + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } + + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& 
column_family, + const std::vector& keys, + std::vector* values) override; + + // This MultiGet is a batched version, which may be faster than calling Get + // multiple times, especially if the keys have some spatial locality that + // enables them to be queried in the same SST files/set of files. The larger + // the batch size, the more scope for batching and performance improvement + // The values and statuses parameters are arrays with number of elements + // equal to keys.size(). This allows the storage for those to be alloacted + // by the caller on the stack for small batches + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + virtual void MultiGetWithCallback( + const ReadOptions& options, ColumnFamilyHandle* column_family, + ReadCallback* callback, + autovector* sorted_keys); + + virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const std::string& column_family, + ColumnFamilyHandle** handle) override; + virtual Status CreateColumnFamilies( + const ColumnFamilyOptions& cf_options, + const std::vector& column_family_names, + std::vector* handles) override; + virtual Status CreateColumnFamilies( + const std::vector& column_families, + std::vector* handles) override; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + virtual Status DropColumnFamilies( + const std::vector& column_families) override; + + // Returns false if key doesn't exist in the database and true if it may. + // If value_found is not passed in as null, then return the value if found in + // memory. On return, if value was found, then value_found will be set to true + // , otherwise false. 
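A short sketch of how the KeyMayExist() override declared just below is normally used; the helper name MightContain is illustrative. The important property, restated from the comment above, is that only a false return is authoritative:

#include <string>

#include "rocksdb/db.h"

// Illustrative: a cheap negative check before doing a more expensive Get().
bool MightContain(ROCKSDB_NAMESPACE::DB* db,
                  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf,
                  const ROCKSDB_NAMESPACE::Slice& key) {
  using namespace ROCKSDB_NAMESPACE;
  std::string value;
  bool value_found = false;
  // May return true even if the key is absent (e.g. a filter false
  // positive); a false return means the key is definitely not present.
  bool may_exist = db->KeyMayExist(ReadOptions(), cf, key, &value,
                                   &value_found);
  if (may_exist && value_found) {
    // `value` was already fetched from memory; no separate Get() is needed.
  }
  return may_exist;
}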
+ using DB::KeyMayExist; + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, + bool* value_found = nullptr) override; + + using DB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override; + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + virtual const Snapshot* GetSnapshot() override; + virtual void ReleaseSnapshot(const Snapshot* snapshot) override; + using DB::GetProperty; + virtual bool GetProperty(ColumnFamilyHandle* column_family, + const Slice& property, std::string* value) override; + using DB::GetMapProperty; + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map* value) override; + using DB::GetIntProperty; + virtual bool GetIntProperty(ColumnFamilyHandle* column_family, + const Slice& property, uint64_t* value) override; + using DB::GetAggregatedIntProperty; + virtual bool GetAggregatedIntProperty(const Slice& property, + uint64_t* aggregated_value) override; + using DB::GetApproximateSizes; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) override; + using DB::GetApproximateMemTableStats; + virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, + const Range& range, + uint64_t* const count, + uint64_t* const size) override; + using DB::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override; + + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override; + + virtual Status PauseBackgroundWork() override; + virtual Status ContinueBackgroundWork() override; + + virtual Status EnableAutoCompaction( + const std::vector& column_family_handles) override; + + virtual void EnableManualCompaction() override; + virtual void DisableManualCompaction() override; + + using DB::SetOptions; + Status SetOptions( + ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) override; + + virtual Status SetDBOptions( + const std::unordered_map& options_map) override; + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) override; + using DB::MaxMemCompactionLevel; + virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override; + using DB::Level0StopWriteTrigger; + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override; + virtual const std::string& GetName() const override; + virtual Env* GetEnv() const override; + virtual FileSystem* GetFileSystem() const override; + using DB::GetOptions; + virtual Options GetOptions(ColumnFamilyHandle* column_family) const override; + using DB::GetDBOptions; + virtual DBOptions GetDBOptions() const override; + using DB::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) override; + virtual Status Flush( + const FlushOptions& options, + const std::vector& column_families) override; + virtual Status FlushWAL(bool sync) override; + 
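The batched MultiGet() overloads declared a little earlier take caller-allocated key, value and status arrays, which is what allows small batches to live entirely on the stack. A minimal usage sketch with made-up keys:

#include <array>

#include "rocksdb/db.h"

void BatchedLookup(ROCKSDB_NAMESPACE::DB* db,
                   ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf) {
  using namespace ROCKSDB_NAMESPACE;
  std::array<Slice, 3> keys = {Slice("k1"), Slice("k2"), Slice("k3")};
  std::array<PinnableSlice, 3> values;
  std::array<Status, 3> statuses;
  // Values and statuses are caller-allocated (here on the stack), matching
  // the comment on the declaration above; keys are unsorted, so pass false.
  db->MultiGet(ReadOptions(), cf, keys.size(), keys.data(), values.data(),
               statuses.data(), /*sorted_input=*/false);
  for (size_t i = 0; i < keys.size(); ++i) {
    if (statuses[i].ok()) {
      // values[i] points into pinned block data or its own internal buffer.
    }
  }
}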
bool TEST_WALBufferIsEmpty(bool lock = true); + virtual Status SyncWAL() override; + virtual Status LockWAL() override; + virtual Status UnlockWAL() override; + + virtual SequenceNumber GetLatestSequenceNumber() const override; + + virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + + virtual Status GetDbIdentity(std::string& identity) const override; + + virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + + virtual Status Close() override; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + +#ifndef ROCKSDB_LITE + using DB::ResetStats; + virtual Status ResetStats() override; + virtual Status DisableFileDeletions() override; + virtual Status EnableFileDeletions(bool force) override; + virtual int IsFileDeletionsEnabled() const; + // All the returned filenames start with "/" + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override; + virtual Status GetSortedWalFiles(VectorLogPtr& files) override; + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override; + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override; + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) override; + virtual Status DeleteFile(std::string name) override; + Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); + + virtual void GetLiveFilesMetaData( + std::vector* metadata) override; + + // Obtains the meta data of the specified column family of the DB. + // Status::NotFound() will be returned if the current DB does not have + // any column family match the specified name. + // TODO(yhchiang): output parameter is placed in the end in this codebase. 
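The WAL-tailing API declared above, GetUpdatesSince(), hands back a TransactionLogIterator. A hedged sketch of a caller that walks every write batch from a known sequence number; the helper name TailWal is illustrative:

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

ROCKSDB_NAMESPACE::Status TailWal(ROCKSDB_NAMESPACE::DB* db,
                                  ROCKSDB_NAMESPACE::SequenceNumber from_seq) {
  using namespace ROCKSDB_NAMESPACE;
  std::unique_ptr<TransactionLogIterator> iter;
  Status s = db->GetUpdatesSince(from_seq, &iter);
  if (!s.ok()) return s;
  for (; iter->Valid(); iter->Next()) {
    BatchResult batch = iter->GetBatch();
    // batch.sequence is the starting sequence number of the write batch;
    // batch.writeBatchPtr holds the batch itself for replay or inspection.
    (void)batch;
  }
  return iter->status();
}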
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; + + Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) override; + + Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override; + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override; + + using DB::VerifyChecksum; + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + +#endif // ROCKSDB_LITE + + // ---- End of implementations of the DB interface ---- + + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions get_impl_options); + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + 
// the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visiblility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; + +#ifndef ROCKSDB_LITE + // Similar to Write() but will call the callback once on the single write + // thread to determine whether it is safe to perform the write. + virtual Status WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback); + + // Returns the sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into the current + // memtables. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + // + // If the earliest sequence number could not be determined, + // kMaxSequenceNumber will be returned. + // + // If include_history=true, will also search Memtables in MemTableList + // History. + SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv, + bool include_history); + + // For a given key, check to see if there are any records for this key + // in the memtables, including memtable history. If cache_only is false, + // SST files will also be checked. + // + // If a key is found, *found_record_for_key will be set to true and + // *seq will be set to the stored sequence number for the latest + // operation on this key or kMaxSequenceNumber if unknown. + // If no key is found, *found_record_for_key will be set to false. + // + // Note: If cache_only=false, it is possible for *seq to be set to 0 if + // the sequence number has been cleared from the record. If the caller is + // holding an active db snapshot, we know the missing sequence must be less + // than the snapshot's sequence number (sequence numbers are only cleared + // when there are no earlier active snapshots). + // + // If NotFound is returned and found_record_for_key is set to false, then no + // record for this key was found. If the caller is holding an active db + // snapshot, we know that no key could have existing after this snapshot + // (since we do not compact keys that have an earlier snapshot). + // + // Only records newer than or at `lower_bound_seq` are guaranteed to be + // returned. Memtables and files may not be checked if it only contains data + // older than `lower_bound_seq`. + // + // Returns OK or NotFound on success, + // other status on unexpected error. + // TODO(andrewkr): this API need to be aware of range deletion operations + Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, + bool* found_record_for_key, + bool* is_blob_index = nullptr); + + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); +#endif // ROCKSDB_LITE + + // Similar to GetSnapshot(), but also lets the db know that this snapshot + // will be used for transaction write-conflict checking. The DB can then + // make sure not to compact any keys that would prevent a write-conflict from + // being detected. 
+ const Snapshot* GetSnapshotForWriteConflictBoundary(); + + // checks if all live files exist on file system and that their file sizes + // match to our in-memory records + virtual Status CheckConsistency(); + + // max_file_num_to_ignore allows bottom level compaction to filter out newly + // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will + // disable the filtering + Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, + int output_level, + const CompactRangeOptions& compact_range_options, + const Slice* begin, const Slice* end, + bool exclusive, bool disallow_trivial_move, + uint64_t max_file_num_to_ignore); + + // Return an internal iterator over the current state of the database. + // The keys of this iterator are internal keys (see format.h). + // The returned iterator should be deleted when no longer needed. + InternalIterator* NewInternalIterator( + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, + ColumnFamilyHandle* column_family = nullptr); + + LogsWithPrepTracker* logs_with_prep_tracker() { + return &logs_with_prep_tracker_; + } + + struct BGJobLimits { + int max_flushes; + int max_compactions; + }; + // Returns maximum background flushes and compactions allowed to be scheduled + BGJobLimits GetBGJobLimits() const; + // Need a static version that can be called during SanitizeOptions(). + static BGJobLimits GetBGJobLimits(int max_background_flushes, + int max_background_compactions, + int max_background_jobs, + bool parallelize_compactions); + + // move logs pending closing from job_context to the DB queue and + // schedule a purge + void ScheduleBgLogWriterClose(JobContext* job_context); + + uint64_t MinLogNumberToKeep(); + + // Returns the lower bound file number for SSTs that won't be deleted, even if + // they're obsolete. This lower bound is used internally to prevent newly + // created flush/compaction output files from being deleted before they're + // installed. This technique avoids the need for tracking the exact numbers of + // files pending creation, although it prevents more files than necessary from + // being deleted. + uint64_t MinObsoleteSstNumberToKeep(); + + // Returns the list of live files in 'live' and the list + // of all files in the filesystem in 'candidate_files'. + // If force == false and the last call was less than + // db_options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the job_context + void FindObsoleteFiles(JobContext* job_context, bool force, + bool no_full_scan = false); + + // Diffs the files listed in filenames and those that do not + // belong to live files are possibly removed. Also, removes all the + // files in sst_delete_files and log_delete_files. + // It is not necessary to hold the mutex when invoking this method. + // If FindObsoleteFiles() was run, we need to also run + // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true + void PurgeObsoleteFiles(JobContext& background_contet, + bool schedule_only = false); + + // Schedule a background job to actually delete obsolete files. + void SchedulePurge(); + + const SnapshotList& snapshots() const { return snapshots_; } + + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. 
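Since the snapshot list exposed just above also gates shutdown (DBImpl::Close() earlier in this patch returns Status::Aborted() while any snapshot is unreleased), here is a short sketch of disciplined snapshot usage through the public API; the helper name ReadAtSnapshot is illustrative:

#include <string>

#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status ReadAtSnapshot(ROCKSDB_NAMESPACE::DB* db,
                                         const ROCKSDB_NAMESPACE::Slice& key,
                                         std::string* value) {
  using namespace ROCKSDB_NAMESPACE;
  const Snapshot* snap = db->GetSnapshot();
  ReadOptions read_options;
  read_options.snapshot = snap;  // pin the read to this sequence number
  Status s = db->Get(read_options, key, value);
  // Every snapshot must be released before Close(); DBImpl::Close() refuses
  // to shut down while a snapshot is still outstanding.
  db->ReleaseSnapshot(snap);
  return s;
}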
+ void LoadSnapshots(std::vector* snap_vector, + SequenceNumber* oldest_write_conflict_snapshot, + const SequenceNumber& max_seq) const { + InstrumentedMutexLock l(mutex()); + snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq); + } + + const ImmutableDBOptions& immutable_db_options() const { + return immutable_db_options_; + } + + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. + void CancelAllBackgroundWork(bool wait); + + // Find Super version and reference it. Based on options, it might return + // the thread local cached one. + // Call ReturnAndCleanupSuperVersion() when it is no longer needed. + SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd); + + // Similar to the previous function but looks up based on a column family id. + // nullptr will be returned if this column family no longer exists. + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. + SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); + + // Un-reference the super version and clean it up if it is the last reference. + void CleanupSuperVersion(SuperVersion* sv); + + // Un-reference the super version and return it to thread local cache if + // needed. If it is the last reference of the super version. Clean it up + // after un-referencing it. + void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv); + + // Similar to the previous function but looks up based on a column family id. + // nullptr will be returned if this column family no longer exists. + // REQUIRED: this function should only be called on the write thread. + void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv); + + // REQUIRED: this function should only be called on the write thread or if the + // mutex is held. Return value only valid until next call to this function or + // mutex is released. + ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + + // Same as above, should called without mutex held and not on write thread. + std::unique_ptr GetColumnFamilyHandleUnlocked( + uint32_t column_family_id); + + // Returns the number of currently running flushes. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_flushes() { + mutex_.AssertHeld(); + return num_running_flushes_; + } + + // Returns the number of currently running compactions. + // REQUIREMENT: mutex_ must be held when calling this function. + int num_running_compactions() { + mutex_.AssertHeld(); + return num_running_compactions_; + } + + const WriteController& write_controller() { return write_controller_; } + + InternalIterator* NewInternalIterator( + const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence); + + // hollow transactions shell used for recovery. + // these will then be passed to TransactionDB so that + // locks can be reacquired before writing can resume. + struct RecoveredTransaction { + std::string name_; + bool unprepared_; + + struct BatchInfo { + uint64_t log_number_; + // TODO(lth): For unprepared, the memory usage here can be big for + // unprepared transactions. This is only useful for rollbacks, and we + // can in theory just keep keyset for that. + WriteBatch* batch_; + // Number of sub-batches. 
A new sub-batch is created if txn attempts to + // insert a duplicate key,seq to memtable. This is currently used in + // WritePreparedTxn/WriteUnpreparedTxn. + size_t batch_cnt_; + }; + + // This maps the seq of the first key in the batch to BatchInfo, which + // contains WriteBatch and other information relevant to the batch. + // + // For WriteUnprepared, batches_ can have size greater than 1, but for + // other write policies, it must be of size 1. + std::map batches_; + + explicit RecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared) + : name_(name), unprepared_(unprepared) { + batches_[seq] = {log, batch, batch_cnt}; + } + + ~RecoveredTransaction() { + for (auto& it : batches_) { + delete it.second.batch_; + } + } + + void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch, + size_t batch_cnt, bool unprepared) { + assert(batches_.count(seq) == 0); + batches_[seq] = {log_number, batch, batch_cnt}; + // Prior state must be unprepared, since the prepare batch must be the + // last batch. + assert(unprepared_); + unprepared_ = unprepared; + } + }; + + bool allow_2pc() const { return immutable_db_options_.allow_2pc; } + + std::unordered_map + recovered_transactions() { + return recovered_transactions_; + } + + RecoveredTransaction* GetRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + if (it == recovered_transactions_.end()) { + return nullptr; + } else { + return it->second; + } + } + + void InsertRecoveredTransaction(const uint64_t log, const std::string& name, + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared_batch) { + // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple + // times for every unprepared batch encountered during recovery. + // + // If the transaction is prepared, then the last call to + // InsertRecoveredTransaction will have unprepared_batch = false. + auto rtxn = recovered_transactions_.find(name); + if (rtxn == recovered_transactions_.end()) { + recovered_transactions_[name] = new RecoveredTransaction( + log, name, batch, seq, batch_cnt, unprepared_batch); + } else { + rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch); + } + logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log); + } + + void DeleteRecoveredTransaction(const std::string& name) { + auto it = recovered_transactions_.find(name); + assert(it != recovered_transactions_.end()); + auto* trx = it->second; + recovered_transactions_.erase(it); + for (const auto& info : trx->batches_) { + logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed( + info.second.log_number_); + } + delete trx; + } + + void DeleteAllRecoveredTransactions() { + for (auto it = recovered_transactions_.begin(); + it != recovered_transactions_.end(); ++it) { + delete it->second; + } + recovered_transactions_.clear(); + } + + void AddToLogsToFreeQueue(log::Writer* log_writer) { + logs_to_free_queue_.push_back(log_writer); + } + + void AddSuperVersionsToFreeQueue(SuperVersion* sv) { + superversions_to_free_queue_.push_back(sv); + } + + void SetSnapshotChecker(SnapshotChecker* snapshot_checker); + + // Fill JobContext with snapshot information needed by flush and compaction. + void GetSnapshotContext(JobContext* job_context, + std::vector* snapshot_seqs, + SequenceNumber* earliest_write_conflict_snapshot, + SnapshotChecker** snapshot_checker); + + // Not thread-safe. 
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); + + InstrumentedMutex* mutex() const { return &mutex_; } + + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. + Status NewDB(); + + // This is to be used only by internal rocksdb classes. + static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn); + + static Status CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory); + + // find stats map from stats_history_ with smallest timestamp in + // the range of [start_time, end_time) + bool FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map* stats_map); + + // Print information of all tombstones of all iterators to the std::string + // This is only used by ldb. The output might be capped. Tombstones + // printed out are not guaranteed to be in any order. + Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str); + +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + void TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables(const autovector& cfds, + const FlushOptions& flush_opts); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. 
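+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The comment above belongs to TEST_GetLevel0TotalSize(),
+  // declared right below. A sketch of how these TEST_ hooks are commonly
+  // driven from a unit test; `dbfull()` and ASSERT_OK are assumed
+  // test-harness helpers (dbfull() returning this DBImpl*):
+  //
+  //   ASSERT_OK(dbfull()->Put(WriteOptions(), "k", "v"));
+  //   ASSERT_OK(dbfull()->TEST_FlushMemTable(/*wait=*/true));
+  //   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  //   uint64_t l0_bytes = dbfull()->TEST_GetLevel0TotalSize();
+  //
+  // The hooks let tests force background work and wait for it
+  // deterministically instead of polling.
+  // --------------------------------------------------------------------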
+ uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. + Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; +#endif // NDEBUG + + protected: + const std::string dbname_; + std::string db_id_; + std::unique_ptr versions_; + // Flag to check whether we allocated and own the info log file + bool own_info_log_; + const DBOptions initial_db_options_; + Env* const env_; + std::shared_ptr fs_; + const ImmutableDBOptions immutable_db_options_; + MutableDBOptions mutable_db_options_; + Statistics* stats_; + std::unordered_map + recovered_transactions_; + std::unique_ptr tracer_; + InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; + + // State below is protected by mutex_ + // With two_write_queues enabled, some of the variables that accessed during + // WriteToWAL need different synchronization: log_empty_, alive_log_files_, + // logs_, logfile_number_. Refer to the definition of each variable below for + // more description. + mutable InstrumentedMutex mutex_; + + ColumnFamilyHandleImpl* default_cf_handle_; + InternalStats* default_cf_internal_stats_; + + // only used for dynamically adjusting max_total_wal_size. it is a sum of + // [write_buffer_size * max_write_buffer_number] over all column families + uint64_t max_total_in_memory_state_; + // If true, we have only one (default) column family. We use this to optimize + // some code-paths + bool single_column_family_mode_; + + // The options to access storage files + const FileOptions file_options_; + + // Additonal options for compaction and flush + FileOptions file_options_for_compaction_; + + std::unique_ptr column_family_memtables_; + + // Increase the sequence number after writing each batch, whether memtable is + // disabled for that or not. Otherwise the sequence number is increased after + // writing each key into memtable. This implies that when disable_memtable is + // set, the seq is not increased at all. 
+ // + // Default: false + const bool seq_per_batch_; + // This determines during recovery whether we expect one writebatch per + // recovered transaction, or potentially multiple writebatches per + // transaction. For WriteUnprepared, this is set to false, since multiple + // batches can exist per transaction. + // + // Default: true + const bool batch_per_txn_; + + // Except in DB::Open(), WriteOptionsFile can only be called when: + // Persist options to options file. + // If need_mutex_lock = false, the method will lock DB mutex. + // If need_enter_write_thread = false, the method will enter write thread. + Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + + // The following two functions can only be called when: + // 1. WriteThread::Writer::EnterUnbatched() is used. + // 2. db_mutex is NOT held + Status RenameTempFileToOptionsFile(const std::string& file_name); + Status DeleteObsoleteOptionsFiles(); + + void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, + const MutableCFOptions& mutable_cf_options, + int job_id); + + void NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info); + + void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, int job_id); + + void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, + int job_id); + void NotifyOnMemTableSealed(ColumnFamilyData* cfd, + const MemTableInfo& mem_table_info); + +#ifndef ROCKSDB_LITE + void NotifyOnExternalFileIngested( + ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); +#endif // !ROCKSDB_LITE + + void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void EraseThreadStatusDbInfo() const; + + // If disable_memtable is set the application logic must guarantee that the + // batch will still be skipped from memtable during the recovery. An excption + // to this is seq_per_batch_ mode, in which since each batch already takes one + // seq, it is ok for the batch to write to memtable during recovery as long as + // it only takes one sequence number: i.e., no duplicate keys. + // In WriteCommitted it is guarnateed since disable_memtable is used for + // prepare batch which will be written to memtable later during the commit, + // and in WritePrepared it is guaranteed since it will be used only for WAL + // markers which will never be written to memtable. If the commit marker is + // accompanied with CommitTimeWriteBatch that is not written to memtable as + // long as it has no duplicate keys, it does not violate the one-seq-per-batch + // policy. + // batch_cnt is expected to be non-zero in seq_per_batch mode and + // indicates the number of sub-patches. A sub-patch is a subset of the write + // batch that does not have duplicate keys. 
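+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] Two points about WriteImpl(), declared immediately below.
+  //
+  // 1. Sub-batch counting under seq_per_batch: a duplicate user key starts
+  //    a new sub-batch, e.g.
+  //
+  //      WriteBatch wb;
+  //      wb.Put("a", "1");
+  //      wb.Put("b", "2");
+  //      wb.Put("a", "3");  // duplicate of "a" -> starts sub-batch #2
+  //      // batch_cnt for wb would be 2 in seq_per_batch mode
+  //
+  // 2. The public write entry points funnel into it; DBImpl::Write() is
+  //    roughly (simplified):
+  //
+  //      Status DBImpl::Write(const WriteOptions& wo, WriteBatch* batch) {
+  //        return WriteImpl(wo, batch, /*callback=*/nullptr,
+  //                         /*log_used=*/nullptr);
+  //      }
+  // --------------------------------------------------------------------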
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates, + WriteCallback* callback = nullptr, + uint64_t* log_used = nullptr, uint64_t log_ref = 0, + bool disable_memtable = false, uint64_t* seq_used = nullptr, + size_t batch_cnt = 0, + PreReleaseCallback* pre_release_callback = nullptr); + + Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, + WriteCallback* callback = nullptr, + uint64_t* log_used = nullptr, uint64_t log_ref = 0, + bool disable_memtable = false, + uint64_t* seq_used = nullptr); + + // Write only to memtables without joining any write queue + Status UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t log_ref, SequenceNumber seq, + const size_t sub_batch_cnt); + + // Whether the batch requires to be assigned with an order + enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder }; + // Whether it requires publishing last sequence or not + enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq }; + + // Join the write_thread to write the batch only to the WAL. It is the + // responsibility of the caller to also write the write batch to the memtable + // if it required. + // + // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder + // indicating the number of sub-batches in my_batch. A sub-patch is a subset + // of the write batch that does not have duplicate keys. When seq_per_batch is + // not set, each key is a separate sub_batch. Otherwise each duplicate key + // marks start of a new sub-batch. + Status WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& options, + WriteBatch* updates, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable); + + // write cached_recoverable_state_ to memtable if it is not empty + // The writer must be the leader in write_thread_ and holding mutex_ + Status WriteRecoverableState(); + + // Actual implementation of Close() + Status CloseImpl(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit. + // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is + // skipped. 
+ virtual Status Recover( + const std::vector& column_families, + bool read_only = false, bool error_if_log_file_exist = false, + bool error_if_data_exists_in_logs = false, + uint64_t* recovered_seq = nullptr); + + virtual bool OwnTablesAndLogs() const { return true; } + + private: + friend class DB; + friend class ErrorHandler; + friend class InternalStats; + friend class PessimisticTransaction; + friend class TransactionBaseImpl; + friend class WriteCommittedTxn; + friend class WritePreparedTxn; + friend class WritePreparedTxnDB; + friend class WriteBatchWithIndex; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + +#ifndef ROCKSDB_LITE + friend class ForwardIterator; +#endif + friend struct SuperVersion; + friend class CompactedDBImpl; + friend class DBTest_ConcurrentFlushWAL_Test; + friend class DBTest_MixedSlowdownOptionsStop_Test; + friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; +#ifndef NDEBUG + friend class DBTest2_ReadCallbackTest_Test; + friend class WriteCallbackTest_WriteWithCallbackTest_Test; + friend class XFTransactionWriteHandler; + friend class DBBlobIndexTest; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; +#endif + + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; + + struct WriteContext { + SuperVersionContext superversion_context; + autovector memtables_to_free_; + + explicit WriteContext(bool create_superversion = false) + : superversion_context(create_superversion) {} + + ~WriteContext() { + superversion_context.Clean(); + for (auto& m : memtables_to_free_) { + delete m; + } + } + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + return s; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. + log::Writer* writer; // own + // true for some prefix of logs_ + bool getting_synced = false; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_files_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. 
In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done; + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. + Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; + }; + + struct CompactionArg { + // caller retains ownership of `db`. + DBImpl* db; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; + }; + + // Initialize the built-in column family for persistent stats. Depending on + // whether on-disk persistent stats have been enabled before, it may either + // create a new column family and column family handle or just a column family + // handle. + // Required: DB mutex held + Status InitPersistStatsColumnFamily(); + + // Persistent Stats column family has two format version key which are used + // for compatibility check. Write format version if it's created for the + // first time, read format version and check compatibility if recovering + // from disk. This function requires DB mutex held at entrance but may + // release and re-acquire DB mutex in the process. + // Required: DB mutex held + Status PersistentStatsProcessFormatVersion(); + + Status ResumeImpl(); + + void MaybeIgnoreError(Status* s) const; + + const Status CreateArchivalDirectory(); + + Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, + const std::string& cf_name, + ColumnFamilyHandle** handle); + + Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); + + // Delete any unneeded files and stale in-memory entries. 
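+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The comment above documents DeleteObsoleteFiles(), declared
+  // right below. Separately, a sketch of how the ManualCompactionState
+  // fields above (exclusive, begin, end) are populated from the public
+  // API; `db` is assumed to be an open DB*:
+  //
+  //   CompactRangeOptions cro;
+  //   cro.exclusive_manual_compaction = false;  // -> exclusive == false;
+  //                                             //    automatic compactions
+  //                                             //    may keep running
+  //   // begin/end == nullptr -> compact the whole key range
+  //   Status s = db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
+  // --------------------------------------------------------------------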
+ void DeleteObsoleteFiles(); + // Delete obsolete files and log status and information of file deletion + void DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, FileType type, + uint64_t number); + + // Background process needs to call + // auto x = CaptureCurrentFileNumberInPendingOutputs() + // auto file_num = versions_->NewFileNumber(); + // + // ReleaseFileNumberFromPendingOutputs(x) + // This will protect any file with number `file_num` or greater from being + // deleted while is running. + // ----------- + // This function will capture current file number and append it to + // pending_outputs_. This will prevent any background process to delete any + // file created after this point. + std::list::iterator CaptureCurrentFileNumberInPendingOutputs(); + // This function should be called with the result of + // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file + // created between the calls CaptureCurrentFileNumberInPendingOutputs() and + // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live + // and blocked by any other pending_outputs_ calls) + void ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v); + + Status SyncClosedLogs(JobContext* job_context); + + // Flush the in-memory write buffer to storage. Switches to a new + // log-file/memtable and writes a new descriptor iff successful. Then + // installs a new super version for the column family. + Status FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, + SuperVersionContext* superversion_context, + std::vector& snapshot_seqs, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + Env::Priority thread_pri); + + // Flush the memtables of (multiple) column families to multiple files on + // persistent storage. + Status FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); + + Status AtomicFlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); + + // REQUIRES: log_numbers are sorted in ascending order + // corrupted_log_found is set to true if we recover from a corrupted log file. + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only, + bool* corrupted_log_found); + + // The following two methods are used to flush a memtable to + // storage. The first one is used at database RecoveryTime (when the + // database is opened) and is heavyweight because it holds the mutex + // for the entire period. The second method WriteLevel0Table supports + // concurrent flush memtables to storage. + Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, + MemTable* mem, VersionEdit* edit); + + // Restore alive_log_files_ and total_log_size_ after recovery. + // It needs to run only when there's no flush during recovery + // (e.g. avoid_flush_during_recovery=true). May also trigger flush + // in case total_log_size > max_total_wal_size. + Status RestoreAliveLogFiles(const std::vector& log_numbers); + + // num_bytes: for slowdown case, delay time is calculated based on + // `num_bytes` going through. 
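+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources, though it mirrors how the .cc files in this patch use the
+  // pending-outputs helpers declared above] The comment above describes
+  // DelayWrite(), declared right below. The capture/release protocol that
+  // protects freshly numbered files from deletion looks like this:
+  //
+  //   std::unique_ptr<std::list<uint64_t>::iterator> pending_elem(
+  //       new std::list<uint64_t>::iterator(
+  //           CaptureCurrentFileNumberInPendingOutputs()));
+  //   uint64_t file_num = versions_->NewFileNumber();
+  //   // ... background job writes file `file_num` ...
+  //   ReleaseFileNumberFromPendingOutputs(pending_elem);
+  //   // only now may FindObsoleteFiles()/PurgeObsoleteFiles() delete the
+  //   // file (assuming it is not live).
+  // --------------------------------------------------------------------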
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + + Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, + WriteBatch* my_batch); + + // REQUIRES: mutex locked and in write thread. + Status ScheduleFlushes(WriteContext* context); + + void MaybeFlushStatsCF(autovector* cfds); + + Status TrimMemtableHistory(WriteContext* context); + + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); + + void SelectColumnFamiliesForAtomicFlush(autovector* cfds); + + // Force current memtable contents to be flushed. + Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, + FlushReason flush_reason, bool writes_stopped = false); + + Status AtomicFlushMemTables( + const autovector& column_family_datas, + const FlushOptions& options, FlushReason flush_reason, + bool writes_stopped = false); + + // Wait until flushing this column family won't stall writes + Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed); + + // Wait for memtable flushed. + // If flush_memtable_id is non-null, wait until the memtable with the ID + // gets flush. Otherwise, wait until the column family don't have any + // memtable pending flush. + // resuming_from_bg_err indicates whether the caller is attempting to resume + // from background error. + Status WaitForFlushMemTable(ColumnFamilyData* cfd, + const uint64_t* flush_memtable_id = nullptr, + bool resuming_from_bg_err = false) { + return WaitForFlushMemTables({cfd}, {flush_memtable_id}, + resuming_from_bg_err); + } + // Wait for memtables to be flushed for multiple column families. + Status WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids, + bool resuming_from_bg_err); + + inline void WaitForPendingWrites() { + mutex_.AssertHeld(); + TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock"); + // In case of pipelined write is enabled, wait for all pending memtable + // writers. + if (immutable_db_options_.enable_pipelined_write) { + // Memtable writers may call DB::Get in case max_successive_merges > 0, + // which may lock mutex. Unlocking mutex here to avoid deadlock. + mutex_.Unlock(); + write_thread_.WaitForMemTableWriters(); + mutex_.Lock(); + } + + if (!immutable_db_options_.unordered_write) { + // Then the writes are finished before the next write group starts + return; + } + + // Wait for the ones who already wrote to the WAL to finish their + // memtable write. + if (pending_memtable_writes_.load() != 0) { + std::unique_lock guard(switch_mutex_); + switch_cv_.wait(guard, + [&] { return pending_memtable_writes_.load() == 0; }); + } + } + + // REQUIRES: mutex locked and in write thread. + void AssignAtomicFlushSeq(const autovector& cfds); + + // REQUIRES: mutex locked and in write thread. + Status SwitchWAL(WriteContext* write_context); + + // REQUIRES: mutex locked and in write thread. 
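+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The REQUIRES line above belongs to HandleWriteBufferFull(),
+  // declared right below. A sketch of how the flush entry points declared
+  // above are typically driven; `cfd` is assumed to be a live
+  // ColumnFamilyData*:
+  //
+  //   FlushOptions fo;
+  //   fo.wait = true;  // block until the flush finishes
+  //   Status s = FlushMemTable(cfd, fo, FlushReason::kManualFlush);
+  //   // internally this switches the memtable, schedules a background
+  //   // flush, and (because fo.wait is set) ends up in
+  //   // WaitForFlushMemTable(cfd).
+  // --------------------------------------------------------------------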
+ Status HandleWriteBufferFull(WriteContext* write_context); + + // REQUIRES: mutex locked + Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, + WriteContext* write_context); + + WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group, + WriteBatch* tmp_batch, size_t* write_with_wal, + WriteBatch** to_be_cached_state); + + Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, + uint64_t* log_used, uint64_t* log_size); + + Status WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence); + + Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* log_used, SequenceNumber* last_sequence, + size_t seq_inc); + + // Used by WriteImpl to update bg_error_ if paranoid check is enabled. + void WriteStatusCheck(const Status& status); + + // Used by WriteImpl to update bg_error_ in case of memtable insert error. + void MemTableInsertStatusCheck(const Status& memtable_insert_status); + +#ifndef ROCKSDB_LITE + + Status CompactFilesImpl(const CompactionOptions& compact_options, + ColumnFamilyData* cfd, Version* version, + const std::vector& input_file_names, + std::vector* const output_file_names, + const int output_level, int output_path_id, + JobContext* job_context, LogBuffer* log_buffer, + CompactionJobInfo* compaction_job_info); + + // Wait for current IngestExternalFile() calls to finish. + // REQUIRES: mutex_ held + void WaitForIngestFile(); + +#else + // IngestExternalFile is not supported in ROCKSDB_LITE so this function + // will be no-op + void WaitForIngestFile() {} +#endif // ROCKSDB_LITE + + ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); + + void MaybeScheduleFlushOrCompaction(); + + // A flush request specifies the column families to flush as well as the + // largest memtable id to persist for each column family. Once all the + // memtables whose IDs are smaller than or equal to this per-column-family + // specified value, this flush request is considered to have completed its + // work of flushing this column family. After completing the work for all + // column families in this request, this flush is considered complete. + typedef std::vector> FlushRequest; + + void GenerateFlushRequest(const autovector& cfds, + FlushRequest* req); + + void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason); + + void SchedulePendingCompaction(ColumnFamilyData* cfd); + void SchedulePendingPurge(std::string fname, std::string dir_to_sync, + FileType type, uint64_t number, int job_id); + static void BGWorkCompaction(void* arg); + // Runs a pre-chosen universal compaction involving bottom level in a + // separate, bottom-pri thread pool. 
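+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The comment above describes BGWorkBottomCompaction(),
+  // declared right below. Separately: the FlushRequest typedef above lost
+  // its template arguments in this rendering; in the upstream sources it
+  // is (approximately) a vector of (column family, max memtable ID) pairs:
+  //
+  //   typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>>
+  //       FlushRequest;
+  //
+  //   // typical use, with mutex_ held in the write thread:
+  //   FlushRequest req;
+  //   GenerateFlushRequest({cfd}, &req);  // one (cfd, max memtable id) entry
+  //   SchedulePendingFlush(req, FlushReason::kWriteBufferFull);
+  //   MaybeScheduleFlushOrCompaction();
+  // --------------------------------------------------------------------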
+ static void BGWorkBottomCompaction(void* arg); + static void BGWorkFlush(void* arg); + static void BGWorkPurge(void* arg); + static void UnscheduleCompactionCallback(void* arg); + static void UnscheduleFlushCallback(void* arg); + void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri); + void BackgroundCallFlush(Env::Priority thread_pri); + void BackgroundCallPurge(); + Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri); + Status BackgroundFlush(bool* madeProgress, JobContext* job_context, + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri); + + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector& inputs, + bool* sfm_bookkeeping, LogBuffer* log_buffer); + + // Request compaction tasks token from compaction thread limiter. + // It always succeeds if force = true or limiter is disable. + bool RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr* token, + LogBuffer* log_buffer); + + // Schedule background tasks + void StartTimedTasks(); + + void PrintStatistics(); + + size_t EstimateInMemoryStatsHistorySize() const; + + // persist stats to column family "_persistent_stats" + void PersistStats(); + + // dump rocksdb.stats to LOG + void DumpStats(); + + // Return the minimum empty level that could hold the total data in the + // input level. Return the input level, if such level could not be found. + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + int level); + + // Move the files in the input level to the target level. + // If target_level < 0, automatically calculate the minimum level that could + // hold the data set. + Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); + + // helper functions for adding and removing from flush & compaction queues + void AddToCompactionQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromCompactionQueue(); + FlushRequest PopFirstFromFlushQueue(); + + // Pick the first unthrottled compaction with task token from queue. + ColumnFamilyData* PickCompactionFromQueue( + std::unique_ptr* token, LogBuffer* log_buffer); + + // helper function to call after some of the logs_ were synced + void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + + SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock = true); + + uint64_t GetMaxTotalWalSize() const; + + Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + + Status CloseHelper(); + + void WaitForBackgroundWork(); + + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. 
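+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, close to but not verbatim the
+  // upstream sources] The comment above documents
+  // InstallSuperVersionAndScheduleWork(), declared right below. The static
+  // BGWork* trampolines above are wired into the Env thread pools via a
+  // heap-allocated argument struct that carries `this` plus context
+  // through the void* callback interface:
+  //
+  //   FlushThreadArg* fta = new FlushThreadArg;
+  //   fta->db_ = this;
+  //   fta->thread_pri_ = Env::Priority::HIGH;
+  //   env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+  //                  &DBImpl::UnscheduleFlushCallback);
+  //   // BGWorkFlush() then unpacks the arg, deletes it, and calls
+  //   // fta.db_->BackgroundCallFlush(fta.thread_pri_).
+  // --------------------------------------------------------------------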
+ void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number); +#endif //! ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + + // Utility function to do some debug validation and sort the given vector + // of MultiGet keys + void PrepareMultiGetKeys( + const size_t num_keys, bool sorted, + autovector* key_ptrs); + + // A structure to hold the information required to process MultiGet of keys + // belonging to one column family. For a multi column family MultiGet, there + // will be a container of these objects. + struct MultiGetColumnFamilyData { + ColumnFamilyHandle* cf; + ColumnFamilyData* cfd; + + // For the batched MultiGet which relies on sorted keys, start specifies + // the index of first key belonging to this column family in the sorted + // list. 
+ size_t start; + + // For the batched MultiGet case, num_keys specifies the number of keys + // belonging to this column family in the sorted list + size_t num_keys; + + // SuperVersion for the column family obtained in a manner that ensures a + // consistent view across all column families in the DB + SuperVersion* super_version; + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, + SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(0), + num_keys(0), + super_version(sv) {} + + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first, + size_t count, SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(first), + num_keys(count), + super_version(sv) {} + + MultiGetColumnFamilyData() = default; + }; + + // A common function to obtain a consistent snapshot, which can be implicit + // if the user doesn't specify a snapshot in read_options, across + // multiple column families for MultiGet. It will attempt to get an implicit + // snapshot without acquiring the db_mutes, but will give up after a few + // tries and acquire the mutex if a memtable flush happens. The template + // allows both the batched and non-batched MultiGet to call this with + // either an std::unordered_map or autovector of column families. + // + // If callback is non-null, the callback is refreshed with the snapshot + // sequence number + // + // A return value of true indicates that the SuperVersions were obtained + // from the ColumnFamilyData, whereas false indicates they are thread + // local + template + bool MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot); + + // The actual implementation of the batching MultiGet. The caller is expected + // to have acquired the SuperVersion and pass in a snapshot sequence number + // in order to construct the LookupKeys. The start_key and num_keys specify + // the range of keys in the sorted_keys vector for a single column family. + void MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, + bool* is_blob_index); + + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + // Lock over the persistent DB state. Non-nullptr iff successfully acquired. + FileLock* db_lock_; + + // In addition to mutex_, log_write_mutex_ protected writes to stats_history_ + InstrumentedMutex stats_history_mutex_; + // In addition to mutex_, log_write_mutex_ protected writes to logs_ and + // logfile_number_. With two_write_queues it also protects alive_log_files_, + // and log_empty_. Refer to the definition of each variable below for more + // details. + // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and + // mutex_, the order should be first mutex_ and then log_write_mutex_. + InstrumentedMutex log_write_mutex_; + + std::atomic shutting_down_; + std::atomic manual_compaction_paused_; + // This condition variable is signaled on these conditions: + // * whenever bg_compaction_scheduled_ goes down to 0 + // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't + // made any progress + // * whenever a compaction made any progress + // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases + // (i.e. 
whenever a flush is done, even if it didn't make any progress) + // * whenever there is an error in background purge, flush or compaction + // * whenever num_running_ingest_file_ goes to 0. + // * whenever pending_purge_obsolete_files_ goes to 0. + // * whenever disable_delete_obsolete_files_ goes to 0. + // * whenever SetOptions successfully updates options. + // * whenever a column family is dropped. + InstrumentedCondVar bg_cv_; + // Writes are protected by locking both mutex_ and log_write_mutex_, and reads + // must be under either mutex_ or log_write_mutex_. Since after ::Open, + // logfile_number_ is currently updated only in write_thread_, it can be read + // from the same write_thread_ without any locks. + uint64_t logfile_number_; + std::deque + log_recycle_files_; // a list of log files that we can recycle + bool log_dir_synced_; + // Without two_write_queues, read and writes to log_empty_ are protected by + // mutex_. Since it is currently updated/read only in write_thread_, it can be + // accessed from the same write_thread_ without any locks. With + // two_write_queues writes, where it can be updated in different threads, + // read and writes are protected by log_write_mutex_ instead. This is to avoid + // expesnive mutex_ lock during WAL write, which update log_empty_. + bool log_empty_; + + ColumnFamilyHandleImpl* persist_stats_cf_handle_; + + bool persistent_stats_cfd_exists_ = true; + + // Without two_write_queues, read and writes to alive_log_files_ are + // protected by mutex_. However since back() is never popped, and push_back() + // is done only from write_thread_, the same thread can access the item + // reffered by back() without mutex_. With two_write_queues_, writes + // are protected by locking both mutex_ and log_write_mutex_, and reads must + // be under either mutex_ or log_write_mutex_. + std::deque alive_log_files_; + // Log files that aren't fully synced, and the current log file. + // Synchronization: + // - push_back() is done from write_thread_ with locked mutex_ and + // log_write_mutex_ + // - pop_front() is done from any thread with locked mutex_ and + // log_write_mutex_ + // - reads are done with either locked mutex_ or log_write_mutex_ + // - back() and items with getting_synced=true are not popped, + // - The same thread that sets getting_synced=true will reset it. + // - it follows that the object referred by back() can be safely read from + // the write_thread_ without using mutex + // - it follows that the items with getting_synced=true can be safely read + // from the same thread that has set getting_synced=true + std::deque logs_; + // Signaled when getting_synced becomes false for some of the logs_. + InstrumentedCondVar log_sync_cv_; + // This is the app-level state that is written to the WAL but will be used + // only during recovery. Using this feature enables not writing the state to + // memtable on normal writes and hence improving the throughput. Each new + // write of the state will replace the previous state entirely even if the + // keys in the two consecuitive states do not overlap. + // It is protected by log_write_mutex_ when two_write_queues_ is enabled. + // Otherwise only the heaad of write_thread_ can access it. + WriteBatch cached_recoverable_state_; + std::atomic cached_recoverable_state_empty_ = {true}; + std::atomic total_log_size_; + + // If this is non-empty, we need to delete these log files in background + // threads. Protected by db mutex. 
+ autovector logs_to_free_; + + bool is_snapshot_supported_; + + std::map> stats_history_; + + std::map stats_slice_; + + bool stats_slice_initialized_ = false; + + Directories directories_; + + WriteBufferManager* write_buffer_manager_; + + WriteThread write_thread_; + WriteBatch tmp_batch_; + // The write thread when the writers have no memtable write. This will be used + // in 2PC to batch the prepares separately from the serial commit. + WriteThread nonmem_write_thread_; + + WriteController write_controller_; + + // Size of the last batch group. In slowdown mode, next write needs to + // sleep if it uses up the quota. + // Note: This is to protect memtable and compaction. If the batch only writes + // to the WAL its size need not to be included in this. + uint64_t last_batch_group_size_; + + FlushScheduler flush_scheduler_; + + TrimHistoryScheduler trim_history_scheduler_; + + SnapshotList snapshots_; + + // For each background job, pending_outputs_ keeps the current file number at + // the time that background job started. + // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has + // number bigger than any of the file number in pending_outputs_. Since file + // numbers grow monotonically, this also means that pending_outputs_ is always + // sorted. After a background job is done executing, its file number is + // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean + // it up. + // State is protected with db mutex. + std::list pending_outputs_; + + // flush_queue_ and compaction_queue_ hold column families that we need to + // flush and compact, respectively. + // A column family is inserted into flush_queue_ when it satisfies condition + // cfd->imm()->IsFlushPending() + // A column family is inserted into compaction_queue_ when it satisfied + // condition cfd->NeedsCompaction() + // Column families in this list are all Ref()-erenced + // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will + // do RAII on ColumnFamilyData + // Column families are in this queue when they need to be flushed or + // compacted. Consumers of these queues are flush and compaction threads. When + // column family is put on this queue, we increase unscheduled_flushes_ and + // unscheduled_compactions_. When these variables are bigger than zero, that + // means we need to schedule background threads for flush and compaction. + // Once the background threads are scheduled, we decrease unscheduled_flushes_ + // and unscheduled_compactions_. That way we keep track of number of + // compaction and flush threads we need to schedule. This scheduling is done + // in MaybeScheduleFlushOrCompaction() + // invariant(column family present in flush_queue_ <==> + // ColumnFamilyData::pending_flush_ == true) + std::deque flush_queue_; + // invariant(column family present in compaction_queue_ <==> + // ColumnFamilyData::pending_compaction_ == true) + std::deque compaction_queue_; + + // A map to store file numbers and filenames of the files to be purged + std::unordered_map purge_files_; + + // A vector to store the file numbers that have been assigned to certain + // JobContext. Current implementation tracks ssts only. 
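+  // --------------------------------------------------------------------
+  // [Editorial note -- simplified sketch, not part of the upstream
+  // sources] The comment above describes files_grabbed_for_purge_,
+  // declared right below. The producer/consumer cycle documented for
+  // flush_queue_ / unscheduled_flushes_ above looks roughly like:
+  //
+  //   // producer side, with mutex_ held in the write thread:
+  //   SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+  //   //   -> pushes onto flush_queue_, bumps unscheduled_flushes_
+  //   MaybeScheduleFlushOrCompaction();
+  //   //   -> while thread-pool budget allows: --unscheduled_flushes_,
+  //   //      ++bg_flush_scheduled_, env_->Schedule(&DBImpl::BGWorkFlush, ...)
+  //
+  //   // consumer side (BackgroundCallFlush): PopFirstFromFlushQueue(),
+  //   // run the flush jobs, then --bg_flush_scheduled_ and signal bg_cv_.
+  // --------------------------------------------------------------------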
+ std::unordered_set files_grabbed_for_purge_; + + // A queue to store log writers to close + std::deque logs_to_free_queue_; + std::deque superversions_to_free_queue_; + int unscheduled_flushes_; + int unscheduled_compactions_; + + // count how many background compactions are running or have been scheduled in + // the BOTTOM pool + int bg_bottom_compaction_scheduled_; + + // count how many background compactions are running or have been scheduled + int bg_compaction_scheduled_; + + // stores the number of compactions are currently running + int num_running_compactions_; + + // number of background memtable flush jobs, submitted to the HIGH pool + int bg_flush_scheduled_; + + // stores the number of flushes are currently running + int num_running_flushes_; + + // number of background obsolete file purge jobs, submitted to the HIGH pool + int bg_purge_scheduled_; + + std::deque manual_compaction_dequeue_; + + // shall we disable deletion of obsolete files + // if 0 the deletion is enabled. + // if non-zero, files will not be getting deleted + // This enables two different threads to call + // EnableFileDeletions() and DisableFileDeletions() + // without any synchronization + int disable_delete_obsolete_files_; + + // Number of times FindObsoleteFiles has found deletable files and the + // corresponding call to PurgeObsoleteFiles has not yet finished. + int pending_purge_obsolete_files_; + + // last time when DeleteObsoleteFiles with full scan was executed. Originally + // initialized with startup time. + uint64_t delete_obsolete_files_last_run_; + + // last time stats were dumped to LOG + std::atomic last_stats_dump_time_microsec_; + + // The thread that wants to switch memtable, can wait on this cv until the + // pending writes to memtable finishes. + std::condition_variable switch_cv_; + // The mutex used by switch_cv_. mutex_ should be acquired beforehand. + std::mutex switch_mutex_; + // Number of threads intending to write to memtable + std::atomic pending_memtable_writes_ = {}; + + // Each flush or compaction gets its own job id. this counter makes sure + // they're unique + std::atomic next_job_id_; + + // A flag indicating whether the current rocksdb database has any + // data that is not yet persisted into either WAL or SST file. + // Used when disableWAL is true. + std::atomic has_unpersisted_data_; + + // if an attempt was made to flush all column families that + // the oldest log depends on but uncommitted data in the oldest + // log prevents the log from being released. + // We must attempt to free the dependent memtables again + // at a later time after the transaction in the oldest + // log is fully commited. + bool unable_to_release_oldest_log_; + + static const int KEEP_LOG_FILE_NUM = 1000; + // MSVC version 1800 still does not have constexpr for ::max() + static const uint64_t kNoTimeOut = port::kMaxUint64; + + std::string db_absolute_path_; + + // Number of running IngestExternalFile() or CreateColumnFamilyWithImport() + // calls. 
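+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The comment above and the REQUIRES line below both document
+  // num_running_ingest_file_. Separately, a sketch of the public API that
+  // drives disable_delete_obsolete_files_ above; `db` is assumed to be an
+  // open DB*, and the calls nest across independent threads:
+  //
+  //   db->DisableFileDeletions();               // counter: 0 -> 1
+  //   // ... copy the live SST/WAL/MANIFEST files for a backup ...
+  //   db->EnableFileDeletions(/*force=*/false); // counter: 1 -> 0;
+  //                                             // deletions resume at 0
+  // --------------------------------------------------------------------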
+ // REQUIRES: mutex held + int num_running_ingest_file_; + +#ifndef ROCKSDB_LITE + WalManager wal_manager_; +#endif // ROCKSDB_LITE + + // Unified interface for logging events + EventLogger event_logger_; + + // A value of > 0 temporarily disables scheduling of background work + int bg_work_paused_; + + // A value of > 0 temporarily disables scheduling of background compaction + int bg_compaction_paused_; + + // Guard against multiple concurrent refitting + bool refitting_level_; + + // Indicate DB was opened successfully + bool opened_successfully_; + + // The min threshold to triggere bottommost compaction for removing + // garbages, among all column families. + SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; + + LogsWithPrepTracker logs_with_prep_tracker_; + + // Callback for compaction to check if a key is visible to a snapshot. + // REQUIRES: mutex held + std::unique_ptr snapshot_checker_; + + // Callback for when the cached_recoverable_state_ is written to memtable + // Only to be set during initialization + std::unique_ptr recoverable_state_pre_release_callback_; + + // handle for scheduling stats dumping at fixed intervals + // REQUIRES: mutex locked + std::unique_ptr thread_dump_stats_; + + // handle for scheduling stats snapshoting at fixed intervals + // REQUIRES: mutex locked + std::unique_ptr thread_persist_stats_; + + // When set, we use a separate queue for writes that dont write to memtable. + // In 2PC these are the writes at Prepare phase. + const bool two_write_queues_; + const bool manual_wal_flush_; + + // LastSequence also indicates last published sequence visibile to the + // readers. Otherwise LastPublishedSequence should be used. + const bool last_seq_same_as_publish_seq_; + // It indicates that a customized gc algorithm must be used for + // flush/compaction and if it is not provided vis SnapshotChecker, we should + // disable gc to be safe. + const bool use_custom_gc_; + // Flag to indicate that the DB instance shutdown has been initiated. This + // different from shutting_down_ atomic in that it is set at the beginning + // of shutdown sequence, specifically in order to prevent any background + // error recovery from going on in parallel. The latter, shutting_down_, + // is set a little later during the shutdown after scheduling memtable + // flushes + std::atomic shutdown_initiated_; + // Flag to indicate whether sst_file_manager object was allocated in + // DB::Open() or passed to us + bool own_sfm_; + + // Clients must periodically call SetPreserveDeletesSequenceNumber() + // to advance this seqnum. Default value is 0 which means ALL deletes are + // preserved. Note that this has no effect if DBOptions.preserve_deletes + // is set to false. + std::atomic preserve_deletes_seqnum_; + const bool preserve_deletes_; + + // Flag to check whether Close() has been called on this DB + bool closed_; + + ErrorHandler error_handler_; + + // Conditional variable to coordinate installation of atomic flush results. + // With atomic flush, each bg thread installs the result of flushing multiple + // column families, and different threads can flush different column + // families. It's difficult to rely on one thread to perform batch + // installation for all threads. This is different from the non-atomic flush + // case. + // atomic_flush_install_cv_ makes sure that threads install atomic flush + // results sequentially. Flush results of memtables with lower IDs get + // installed to MANIFEST first. 
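+  // --------------------------------------------------------------------
+  // [Editorial note -- illustrative sketch, not part of the upstream
+  // sources] The comment above documents atomic_flush_install_cv_,
+  // declared right below. Separately, a sketch of the public knobs behind
+  // preserve_deletes_ / preserve_deletes_seqnum_ above; `db` is assumed to
+  // be an open DB*:
+  //
+  //   Options options;
+  //   options.preserve_deletes = true;  // keep delete markers at or after
+  //                                     // the cutoff seqnum
+  //   // ... open the DB with these options ...
+  //   // periodically advance the cutoff so older deletes can be compacted:
+  //   db->SetPreserveDeletesSequenceNumber(db->GetLatestSequenceNumber());
+  // --------------------------------------------------------------------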
+ InstrumentedCondVar atomic_flush_install_cv_; + + bool wal_in_db_path_; +}; + +extern Options SanitizeOptions(const std::string& db, const Options& src); + +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); + +extern CompressionType GetCompressionFlush( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + +// Return the earliest log file to keep after the memtable flush is +// finalized. +// `cfd_to_flush` is the column family whose memtable (specified in +// `memtables_to_flush`) will be flushed and thus will not depend on any WAL +// file. +// The function is only applicable to 2pc mode. +extern uint64_t PrecomputeMinLogNumberToKeep( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + autovector edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// `cfd_to_flush` is the column family whose memtable will be flushed and thus +// will not depend on any WAL file. nullptr means no memtable is being flushed. +// The function is only applicable to 2pc mode. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const ColumnFamilyData* cfd_to_flush, + const autovector& memtables_to_flush); + +// Fix user-supplied options to be reasonable +template +static void ClipToRange(T* ptr, V minvalue, V maxvalue) { + if (static_cast(*ptr) > maxvalue) *ptr = maxvalue; + if (static_cast(*ptr) < minvalue) *ptr = minvalue; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc new file mode 100644 index 000000000..c7b3510c3 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc @@ -0,0 +1,3116 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_impl/db_impl.h" + +#include + +#include "db/builder.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/perf_context_imp.h" +#include "monitoring/thread_status_updater.h" +#include "monitoring/thread_status_util.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/concurrent_task_limiter_impl.h" + +namespace ROCKSDB_NAMESPACE { + +bool DBImpl::EnoughRoomForCompaction( + ColumnFamilyData* cfd, const std::vector& inputs, + bool* sfm_reserved_compact_space, LogBuffer* log_buffer) { + // Check if we have enough room to do the compaction + bool enough_room = true; +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm) { + // Pass the current bg_error_ to SFM so it can decide what checks to + // perform. 
If this DB instance hasn't seen any error yet, the SFM can be + // optimistic and not do disk space checks + enough_room = + sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + if (enough_room) { + *sfm_reserved_compact_space = true; + } + } +#else + (void)cfd; + (void)inputs; + (void)sfm_reserved_compact_space; +#endif // ROCKSDB_LITE + if (!enough_room) { + // Just in case tests want to change the value of enough_room + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room); + ROCKS_LOG_BUFFER(log_buffer, + "Cancelled compaction because not enough room"); + RecordTick(stats_, COMPACTION_CANCELLED, 1); + } + return enough_room; +} + +bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr* token, + LogBuffer* log_buffer) { + assert(*token == nullptr); + auto limiter = static_cast( + cfd->ioptions()->compaction_thread_limiter.get()); + if (limiter == nullptr) { + return true; + } + *token = limiter->GetToken(force); + if (*token != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "Thread limiter [%s] increase [%s] compaction task, " + "force: %s, tasks after: %d", + limiter->GetName().c_str(), cfd->GetName().c_str(), + force ? "true" : "false", limiter->GetOutstandingTask()); + return true; + } + return false; +} + +Status DBImpl::SyncClosedLogs(JobContext* job_context) { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); + mutex_.AssertHeld(); + autovector logs_to_sync; + uint64_t current_log_number = logfile_number_; + while (logs_.front().number < current_log_number && + logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + for (auto it = logs_.begin(); + it != logs_.end() && it->number < current_log_number; ++it) { + auto& log = *it; + assert(!log.getting_synced); + log.getting_synced = true; + logs_to_sync.push_back(log.writer); + } + + Status s; + if (!logs_to_sync.empty()) { + mutex_.Unlock(); + + for (log::Writer* log : logs_to_sync) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, + log->get_log_number()); + s = log->file()->Sync(immutable_db_options_.use_fsync); + if (!s.ok()) { + break; + } + + if (immutable_db_options_.recycle_log_file_num > 0) { + s = log->Close(); + if (!s.ok()) { + break; + } + } + } + if (s.ok()) { + s = directories_.GetWalDir()->Fsync(); + } + + mutex_.Lock(); + + // "number <= current_log_number - 1" is equivalent to + // "number < current_log_number". 
+ MarkLogsSynced(current_log_number - 1, true, s); + if (!s.ok()) { + error_handler_.SetBGError(s, BackgroundErrorReason::kFlush); + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); + return s; + } + } + return s; +} + +Status DBImpl::FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* made_progress, JobContext* job_context, + SuperVersionContext* superversion_context, + std::vector& snapshot_seqs, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + Env::Priority thread_pri) { + mutex_.AssertHeld(); + assert(cfd->imm()->NumNotFlushed() != 0); + assert(cfd->imm()->IsFlushPending()); + + FlushJob flush_job( + dbname_, cfd, immutable_db_options_, mutable_cf_options, + nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(), + &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), + GetDataDir(cfd, 0U), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, + &event_logger_, mutable_cf_options.report_bg_io_stats, + true /* sync_output_directory */, true /* write_manifest */, thread_pri); + + FileMetaData file_meta; + + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); + flush_job.PickMemTable(); + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); + +#ifndef ROCKSDB_LITE + // may temporarily unlock and lock the mutex. + NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); +#endif // ROCKSDB_LITE + + Status s; + if (logfile_number_ > 0 && + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { + // If there are more than one column families, we need to make sure that + // all the log files except the most recent one are synced. Otherwise if + // the host crashes after flushing and before WAL is persistent, the + // flushed SST may contain data from write batches whose updates to + // other column families are missing. + // SyncClosedLogs() may unlock and re-lock the db_mutex. + s = SyncClosedLogs(job_context); + } else { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); + } + + // Within flush_job.Run, rocksdb may call event listener to notify + // file creation and deletion. + // + // Note that flush_job.Run will unlock and lock the db_mutex, + // and EventListener callback will be called when the db_mutex + // is unlocked by the current thread. + if (s.ok()) { + s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); + } else { + flush_job.Cancel(); + } + + if (s.ok()) { + InstallSuperVersionAndScheduleWork(cfd, superversion_context, + mutable_cf_options); + if (made_progress) { + *made_progress = true; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + if (s.ok()) { +#ifndef ROCKSDB_LITE + // may temporarily unlock and lock the mutex. 
+    NotifyOnFlushCompleted(cfd, mutable_cf_options,
+                           flush_job.GetCommittedFlushJobsInfo());
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm) {
+      // Notify sst_file_manager that a new file was added
+      std::string file_path = MakeTableFileName(
+          cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+      sfm->OnAddFile(file_path);
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        Status new_bg_error =
+            Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+            &new_bg_error);
+        error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+  return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  if (immutable_db_options_.atomic_flush) {
+    return AtomicFlushMemTablesToOutputFiles(
+        bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+  }
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  Status status;
+  for (auto& arg : bg_flush_args) {
+    ColumnFamilyData* cfd = arg.cfd_;
+    MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+    SuperVersionContext* superversion_context = arg.superversion_context_;
+    Status s = FlushMemTableToOutputFile(
+        cfd, mutable_cf_options, made_progress, job_context,
+        superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, log_buffer, thread_pri);
+    if (!s.ok()) {
+      status = s;
+      if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+        // At this point, DB is not shutting down, nor is cfd dropped.
+        // Something is wrong, thus we break out of the loop.
+        break;
+      }
+    }
+  }
+  return status;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
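+ *
+ * The implementation below runs one FlushJob per column family without
+ * writing the MANIFEST, waits until it is this thread's turn to install the
+ * results, and then commits them all with a single call to
+ * InstallMemtableAtomicFlushResults().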
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  autovector<ColumnFamilyData*> cfds;
+  for (const auto& arg : bg_flush_args) {
+    cfds.emplace_back(arg.cfd_);
+  }
+
+#ifndef NDEBUG
+  for (const auto cfd : cfds) {
+    assert(cfd->imm()->NumNotFlushed() != 0);
+    assert(cfd->imm()->IsFlushPending());
+  }
+#endif /* !NDEBUG */
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  autovector<Directory*> distinct_output_dirs;
+  autovector<std::string> distinct_output_dir_paths;
+  std::vector<std::unique_ptr<FlushJob>> jobs;
+  std::vector<MutableCFOptions> all_mutable_cf_options;
+  int num_cfs = static_cast<int>(cfds.size());
+  all_mutable_cf_options.reserve(num_cfs);
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfd = cfds[i];
+    Directory* data_dir = GetDataDir(cfd, 0U);
+    const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+    // Add to distinct output directories if eligible. Use linear search. Since
+    // the number of elements in the vector is not large, performance should be
+    // tolerable.
+    bool found = false;
+    for (const auto& path : distinct_output_dir_paths) {
+      if (path == curr_path) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      distinct_output_dir_paths.emplace_back(curr_path);
+      distinct_output_dirs.emplace_back(data_dir);
+    }
+
+    all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+    const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
+    jobs.emplace_back(new FlushJob(
+        dbname_, cfd, immutable_db_options_, mutable_cf_options,
+        max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+        data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+        stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+        false /* sync_output_directory */, false /* write_manifest */,
+        thread_pri));
+    jobs.back()->PickMemTable();
+  }
+
+  std::vector<FileMetaData> file_meta(num_cfs);
+  Status s;
+  assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+  for (int i = 0; i != num_cfs; ++i) {
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+                       job_context->job_id);
+  }
+#endif /* !ROCKSDB_LITE */
+
+  if (logfile_number_ > 0) {
+    // TODO (yanqin) investigate whether we should sync the closed logs for
+    // single column family case.
+    s = SyncClosedLogs(job_context);
+  }
+
+  // exec_status stores the execution status of flush_jobs as
+  // <bool /* executed */, Status /* status code */>
+  autovector<std::pair<bool, Status>> exec_status;
+  for (int i = 0; i != num_cfs; ++i) {
+    // Initially all jobs are not executed, with status OK.
+    exec_status.emplace_back(false, Status::OK());
+  }
+
+  if (s.ok()) {
+    // TODO (yanqin): parallelize jobs with threads.
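+    // Jobs 1..num_cfs-1 are run first and job 0 last, so that the sync points
+    // below can be used by tests to interleave work while only some flush
+    // jobs of the atomic flush group have completed.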
+ for (int i = 1; i != num_cfs; ++i) { + exec_status[i].second = + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]); + exec_status[i].first = true; + } + if (num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1"); + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); + } + assert(exec_status.size() > 0); + assert(!file_meta.empty()); + exec_status[0].second = + jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]); + exec_status[0].first = true; + + Status error_status; + for (const auto& e : exec_status) { + if (!e.second.ok()) { + s = e.second; + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { + // If a flush job did not return OK, and the CF is not dropped, and + // the DB is not shutting down, then we have to return this result to + // caller later. + error_status = e.second; + } + } + } + + s = error_status.ok() ? s : error_status; + } + + if (s.IsColumnFamilyDropped()) { + s = Status::OK(); + } + + if (s.ok() || s.IsShutdownInProgress()) { + // Sync on all distinct output directories. + for (auto dir : distinct_output_dirs) { + if (dir != nullptr) { + Status error_status = dir->Fsync(); + if (!error_status.ok()) { + s = error_status; + break; + } + } + } + } else { + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + // Have to cancel the flush jobs that have NOT executed because we need to + // unref the versions. + for (int i = 0; i != num_cfs; ++i) { + if (!exec_status[i].first) { + jobs[i]->Cancel(); + } + } + for (int i = 0; i != num_cfs; ++i) { + if (exec_status[i].first && exec_status[i].second.ok()) { + auto& mems = jobs[i]->GetMemTables(); + cfds[i]->imm()->RollbackMemtableFlush(mems, + file_meta[i].fd.GetNumber()); + } + } + } + + if (s.ok()) { + auto wait_to_install_func = [&]() { + bool ready = true; + for (size_t i = 0; i != cfds.size(); ++i) { + const auto& mems = jobs[i]->GetMemTables(); + if (cfds[i]->IsDropped()) { + // If the column family is dropped, then do not wait. + continue; + } else if (!mems.empty() && + cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) { + // If a flush job needs to install the flush result for mems and + // mems[0] is not the earliest memtable, it means another thread must + // be installing flush results for the same column family, then the + // current thread needs to wait. + ready = false; + break; + } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <= + bg_flush_args[i].max_memtable_id_) { + // If a flush job does not need to install flush results, then it has + // to wait until all memtables up to max_memtable_id_ (inclusive) are + // installed. + ready = false; + break; + } + } + return ready; + }; + + bool resuming_from_bg_err = error_handler_.IsDBStopped(); + while ((!error_handler_.IsDBStopped() || + error_handler_.GetRecoveryError().ok()) && + !wait_to_install_func()) { + atomic_flush_install_cv_.Wait(); + } + + s = resuming_from_bg_err ? 
error_handler_.GetRecoveryError() + : error_handler_.GetBGError(); + } + + if (s.ok()) { + autovector tmp_cfds; + autovector*> mems_list; + autovector mutable_cf_options_list; + autovector tmp_file_meta; + for (int i = 0; i != num_cfs; ++i) { + const auto& mems = jobs[i]->GetMemTables(); + if (!cfds[i]->IsDropped() && !mems.empty()) { + tmp_cfds.emplace_back(cfds[i]); + mems_list.emplace_back(&mems); + mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); + tmp_file_meta.emplace_back(&file_meta[i]); + } + } + + s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, + versions_.get(), &mutex_, tmp_file_meta, + &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); + } + + if (s.ok()) { + assert(num_cfs == + static_cast(job_context->superversion_contexts.size())); + for (int i = 0; i != num_cfs; ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + InstallSuperVersionAndScheduleWork(cfds[i], + &job_context->superversion_contexts[i], + all_mutable_cf_options[i]); + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", + cfds[i]->GetName().c_str(), + cfds[i]->current()->storage_info()->LevelSummary(&tmp)); + } + if (made_progress) { + *made_progress = true; + } +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + assert(all_mutable_cf_options.size() == static_cast(num_cfs)); + for (int i = 0; i != num_cfs; ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i], + jobs[i]->GetCommittedFlushJobsInfo()); + if (sfm) { + std::string file_path = MakeTableFileName( + cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); + sfm->OnAddFile(file_path); + if (sfm->IsMaxAllowedSpaceReached() && + error_handler_.GetBGError().ok()) { + Status new_bg_error = + Status::SpaceLimit("Max allowed space was reached"); + error_handler_.SetBGError(new_bg_error, + BackgroundErrorReason::kFlush); + } + } + } +#endif // ROCKSDB_LITE + } + + if (!s.ok() && !s.IsShutdownInProgress()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + + return s; +} + +void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, + const MutableCFOptions& mutable_cf_options, + int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_writes_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_writes_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + // release lock while notifying events + mutex_.Unlock(); + { + FlushJobInfo info{}; + info.cf_id = cfd->GetID(); + info.cf_name = cfd->GetName(); + // TODO(yhchiang): make db_paths dynamic in case flush does not + // go to L0 in the future. 
+ const uint64_t file_number = file_meta->fd.GetNumber(); + info.file_path = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number); + info.file_number = file_number; + info.thread_id = env_->GetThreadID(); + info.job_id = job_id; + info.triggered_writes_slowdown = triggered_writes_slowdown; + info.triggered_writes_stop = triggered_writes_stop; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; + info.flush_reason = cfd->GetFlushReason(); + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushBegin(this, info); + } + } + mutex_.Lock(); +// no need to signal bg_cv_ as it will be signaled at the end of the +// flush process. +#else + (void)cfd; + (void)file_meta; + (void)mutable_cf_options; + (void)job_id; +#endif // ROCKSDB_LITE +} + +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info) { +#ifndef ROCKSDB_LITE + assert(flush_jobs_info != nullptr); + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_writes_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_writes_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + // release lock while notifying events + mutex_.Unlock(); + { + for (auto& info : *flush_jobs_info) { + info->triggered_writes_slowdown = triggered_writes_slowdown; + info->triggered_writes_stop = triggered_writes_stop; + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushCompleted(this, *info); + } + } + flush_jobs_info->clear(); + } + mutex_.Lock(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. +#else + (void)cfd; + (void)mutable_cf_options; + (void)flush_jobs_info; +#endif // ROCKSDB_LITE +} + +Status DBImpl::CompactRange(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) { + return Status::InvalidArgument("Invalid target path ID"); + } + + bool exclusive = options.exclusive_manual_compaction; + + bool flush_needed = true; + if (begin != nullptr && end != nullptr) { + // TODO(ajkr): We could also optimize away the flush in certain cases where + // one/both sides of the interval are unbounded. But it requires more + // changes to RangesOverlapWithMemtables. 
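+    // Check the memtables (active and immutable) for overlap with
+    // [begin, end]; the flush below is only scheduled when some memtable data
+    // falls inside the requested range.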
+ Range range(*begin, *end); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + CleanupSuperVersion(super_version); + } + + Status s; + if (flush_needed) { + FlushOptions fo; + fo.allow_write_stall = options.allow_write_stall; + if (immutable_db_options_.atomic_flush) { + autovector cfds; + mutex_.Lock(); + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction, + false /* writes_stopped */); + } else { + s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction, + false /* writes_stopped*/); + } + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + } + + int max_level_with_files = 0; + // max_file_num_to_ignore can be used to filter out newly created SST files, + // useful for bottom level compaction in a manual compaction + uint64_t max_file_num_to_ignore = port::kMaxUint64; + uint64_t next_file_number = port::kMaxUint64; + { + InstrumentedMutexLock l(&mutex_); + Version* base = cfd->current(); + for (int level = 1; level < base->storage_info()->num_non_empty_levels(); + level++) { + if (base->storage_info()->OverlapInLevel(level, begin, end)) { + max_level_with_files = level; + } + } + next_file_number = versions_->current_next_file_number(); + } + + int final_output_level = 0; + + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && + cfd->NumberLevels() > 1) { + // Always compact all files together. + final_output_level = cfd->NumberLevels() - 1; + // if bottom most level is reserved + if (immutable_db_options_.allow_ingest_behind) { + final_output_level--; + } + s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, + final_output_level, options, begin, end, exclusive, + false, max_file_num_to_ignore); + } else { + for (int level = 0; level <= max_level_with_files; level++) { + int output_level; + // in case the compaction is universal or if we're compacting the + // bottom-most level, the output level will be the same as input one. + // level 0 can never be the bottommost level (i.e. 
if all files are in + // level 0, we will compact to level 1) + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + output_level = level; + } else if (level == max_level_with_files && level > 0) { + if (options.bottommost_level_compaction == + BottommostLevelCompaction::kSkip) { + // Skip bottommost level compaction + continue; + } else if (options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter && + cfd->ioptions()->compaction_filter == nullptr && + cfd->ioptions()->compaction_filter_factory == nullptr) { + // Skip bottommost level compaction since we don't have a compaction + // filter + continue; + } + output_level = level; + // update max_file_num_to_ignore only for bottom level compaction + // because data in newly compacted files in middle levels may still need + // to be pushed down + max_file_num_to_ignore = next_file_number; + } else { + output_level = level + 1; + if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && + cfd->ioptions()->level_compaction_dynamic_level_bytes && + level == 0) { + output_level = ColumnFamilyData::kCompactToBaseLevel; + } + } + s = RunManualCompaction(cfd, level, output_level, options, begin, end, + exclusive, false, max_file_num_to_ignore); + if (!s.ok()) { + break; + } + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + final_output_level = cfd->NumberLevels() - 1; + } else if (output_level > final_output_level) { + final_output_level = output_level; + } + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); + } + } + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + + if (options.change_level) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[RefitLevel] waiting for background threads to stop"); + s = PauseBackgroundWork(); + if (s.ok()) { + s = ReFitLevel(cfd, final_output_level, options.target_level); + } + ContinueBackgroundWork(); + } + LogFlush(immutable_db_options_.info_log); + + { + InstrumentedMutexLock l(&mutex_); + // an automatic compaction that has been scheduled might have been + // preempted by the manual compactions. Need to schedule it back. + MaybeScheduleFlushOrCompaction(); + } + + return s; +} + +Status DBImpl::CompactFiles(const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id, + std::vector* const output_file_names, + CompactionJobInfo* compaction_job_info) { +#ifdef ROCKSDB_LITE + (void)compact_options; + (void)column_family; + (void)input_file_names; + (void)output_level; + (void)output_path_id; + (void)output_file_names; + (void)compaction_job_info; + // not supported in lite version + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + if (column_family == nullptr) { + return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); + } + + auto cfd = reinterpret_cast(column_family)->cfd(); + assert(cfd); + + Status s; + JobContext job_context(0, true); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + // Perform CompactFiles + TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2"); + { + InstrumentedMutexLock l(&mutex_); + + // This call will unlock/lock the mutex to wait for current running + // IngestExternalFile() calls to finish. 
+    WaitForIngestFile();
+
+    // We need to get current after `WaitForIngestFile`, because
+    // `IngestExternalFile` may add files that overlap with `input_file_names`
+    auto* current = cfd->current();
+    current->Ref();
+
+    s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+                         output_file_names, output_level, output_path_id,
+                         &job_context, &log_buffer, compaction_job_info);
+
+    current->Unref();
+  }
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }  // release the mutex
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToClean() ||
+      job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here. No need to Unlock() and Lock() here.
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
+
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    std::vector<std::string>* const output_file_names, const int output_level,
+    int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+    CompactionJobInfo* compaction_job_info) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+  if (manual_compaction_paused_.load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following function calls is pluggable to external developers.
+ version->GetColumnFamilyMetaData(&cf_meta); + + if (output_path_id < 0) { + if (cfd->ioptions()->cf_paths.size() == 1U) { + output_path_id = 0; + } else { + return Status::NotSupported( + "Automatic output path selection is not " + "yet supported in CompactFiles()"); + } + } + + Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( + &input_set, cf_meta, output_level); + if (!s.ok()) { + return s; + } + + std::vector input_files; + s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, version->storage_info(), compact_options); + if (!s.ok()) { + return s; + } + + for (const auto& inputs : input_files) { + if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) { + return Status::Aborted( + "Some of the necessary compaction input " + "files are already being compacted"); + } + } + bool sfm_reserved_compact_space = false; + // First check if we have enough room to do the compaction + bool enough_room = EnoughRoomForCompaction( + cfd, input_files, &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // m's vars will get set properly at the end of this function, + // as long as status == CompactionTooLarge + return Status::CompactionTooLarge(); + } + + // At this point, CompactFiles will be run. + bg_compaction_scheduled_++; + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + compact_options, input_files, output_level, version->storage_info(), + *cfd->GetLatestMutableCFOptions(), output_path_id)); + // we already sanitized the set of input files and checked for conflicts + // without releasing the lock, so we're guaranteed a compaction can be formed. + assert(c != nullptr); + + c->SetInputVersion(version); + // deletion compaction currently not allowed in CompactFiles. + assert(!c->deletion_compaction()); + + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + assert(is_snapshot_supported_ || snapshots_.empty()); + CompactionJobStats compaction_job_stats; + CompactionJob compaction_job( + job_context->job_id, c.get(), immutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, + &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here. 
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(), + *c->mutable_cf_options()); + + compaction_job.Prepare(); + + mutex_.Unlock(); + TEST_SYNC_POINT("CompactFilesImpl:0"); + TEST_SYNC_POINT("CompactFilesImpl:1"); + compaction_job.Run(); + TEST_SYNC_POINT("CompactFilesImpl:2"); + TEST_SYNC_POINT("CompactFilesImpl:3"); + mutex_.Lock(); + + Status status = compaction_job.Install(*c->mutable_cf_options()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + } + c->ReleaseCompactionFiles(s); +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + if (compaction_job_info != nullptr) { + BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats, + job_context->job_id, version, compaction_job_info); + } + + if (status.ok()) { + // Done + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else if (status.IsManualCompactionPaused()) { + // Don't report stopping manual compaction as error + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] [JOB %d] Stopping manual compaction", + c->column_family_data()->GetName().c_str(), + job_context->job_id); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] [JOB %d] Compaction error: %s", + c->column_family_data()->GetName().c_str(), + job_context->job_id, status.ToString().c_str()); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } + + if (output_file_names != nullptr) { + for (const auto newf : c->edit()->GetNewFiles()) { + (*output_file_names) + .push_back(TableFileName(c->immutable_cf_options()->cf_paths, + newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + } + + c.reset(); + + bg_compaction_scheduled_--; + if (bg_compaction_scheduled_ == 0) { + bg_cv_.SignalAll(); + } + MaybeScheduleFlushOrCompaction(); + TEST_SYNC_POINT("CompactFilesImpl:End"); + + return status; +} +#endif // ROCKSDB_LITE + +Status DBImpl::PauseBackgroundWork() { + InstrumentedMutexLock guard_lock(&mutex_); + bg_compaction_paused_++; + while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 || + bg_flush_scheduled_ > 0) { + bg_cv_.Wait(); + } + bg_work_paused_++; + return Status::OK(); +} + +Status DBImpl::ContinueBackgroundWork() { + InstrumentedMutexLock guard_lock(&mutex_); + if (bg_work_paused_ == 0) { + return Status::InvalidArgument(); + } + assert(bg_work_paused_ > 0); + assert(bg_compaction_paused_ > 0); + bg_compaction_paused_--; + bg_work_paused_--; + // It's sufficient to check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ + if (bg_work_paused_ == 0) { + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); +} + +void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, + int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.empty()) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } + 
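+  // Pin the current version so that table properties of the input files can
+  // still be read after the mutex is released for the listener callbacks.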
Version* current = cfd->current(); + current->Ref(); + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); + { + CompactionJobInfo info{}; + info.cf_name = cfd->GetName(); + info.status = st; + info.thread_id = env_->GetThreadID(); + info.job_id = job_id; + info.base_input_level = c->start_level(); + info.output_level = c->output_level(); + info.stats = job_stats; + info.table_properties = c->GetOutputTableProperties(); + info.compaction_reason = c->compaction_reason(); + info.compression = c->output_compression(); + for (size_t i = 0; i < c->num_input_levels(); ++i) { + for (const auto fmd : *c->inputs(i)) { + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, + file_number, desc.GetPathId()); + info.input_files.push_back(fn); + info.input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); + if (info.table_properties.count(fn) == 0) { + std::shared_ptr tp; + auto s = current->GetTableProperties(&tp, fmd, &fn); + if (s.ok()) { + info.table_properties[fn] = tp; + } + } + } + } + for (const auto newf : c->edit()->GetNewFiles()) { + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); + info.output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + info.output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); + } + for (auto listener : immutable_db_options_.listeners) { + listener->OnCompactionBegin(this, info); + } + } + mutex_.Lock(); + current->Unref(); +#else + (void)cfd; + (void)c; + (void)st; + (void)job_stats; + (void)job_id; +#endif // ROCKSDB_LITE +} + +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } + Version* current = cfd->current(); + current->Ref(); + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); + { + CompactionJobInfo info{}; + BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, + &info); + for (auto listener : immutable_db_options_.listeners) { + listener->OnCompactionCompleted(this, info); + } + } + mutex_.Lock(); + current->Unref(); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
+#else + (void)cfd; + (void)c; + (void)st; + (void)compaction_job_stats; + (void)job_id; +#endif // ROCKSDB_LITE +} + +// REQUIREMENT: block all background work by calling PauseBackgroundWork() +// before calling this function +Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { + assert(level < cfd->NumberLevels()); + if (target_level >= cfd->NumberLevels()) { + return Status::InvalidArgument("Target level exceeds number of levels"); + } + + SuperVersionContext sv_context(/* create_superversion */ true); + + Status status; + + InstrumentedMutexLock guard_lock(&mutex_); + + // only allow one thread refitting + if (refitting_level_) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[ReFitLevel] another thread is refitting"); + return Status::NotSupported("another thread is refitting"); + } + refitting_level_ = true; + + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + // move to a smaller level + int to_level = target_level; + if (target_level < 0) { + to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); + } + + auto* vstorage = cfd->current()->storage_info(); + if (to_level > level) { + if (level == 0) { + return Status::NotSupported( + "Cannot change from level 0 to other levels."); + } + // Check levels are empty for a trivial move + for (int l = level + 1; l <= to_level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + } + } + if (to_level != level) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Before refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : vstorage->LevelFiles(level)) { + edit.DeleteFile(level, f->fd.GetNumber()); + edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, + f->file_checksum, f->file_checksum_func_name); + } + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), + edit.DebugString().data()); + + status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, + directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); + + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", + cfd->GetName().c_str(), status.ToString().data()); + + if (status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] After refitting:\n%s", cfd->GetName().c_str(), + cfd->current()->DebugString().data()); + } + } + + sv_context.Clean(); + refitting_level_ = false; + + return status; +} + +int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + return cfh->cfd()->NumberLevels(); +} + +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) { + return 0; +} + +int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + InstrumentedMutexLock l(&mutex_); + return cfh->cfd() + ->GetSuperVersion() + ->mutable_cf_options.level0_stop_writes_trigger; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + 
ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", + cfh->GetName().c_str()); + Status s; + if (immutable_db_options_.atomic_flush) { + s = AtomicFlushMemTables({cfh->cfd()}, flush_options, + FlushReason::kManualFlush); + } else { + s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush); + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] Manual flush finished, status: %s\n", + cfh->GetName().c_str(), s.ToString().c_str()); + return s; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + const std::vector& column_families) { + Status s; + if (!immutable_db_options_.atomic_flush) { + for (auto cfh : column_families) { + s = Flush(flush_options, cfh); + if (!s.ok()) { + break; + } + } + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush start.\n" + "=====Column families:====="); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + autovector cfds; + std::for_each(column_families.begin(), column_families.end(), + [&cfds](ColumnFamilyHandle* elem) { + auto cfh = static_cast(elem); + cfds.emplace_back(cfh->cfd()); + }); + s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush finished, status: %s\n" + "=====Column families:=====", + s.ToString().c_str()); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + } + return s; +} + +Status DBImpl::RunManualCompaction( + ColumnFamilyData* cfd, int input_level, int output_level, + const CompactRangeOptions& compact_range_options, const Slice* begin, + const Slice* end, bool exclusive, bool disallow_trivial_move, + uint64_t max_file_num_to_ignore) { + assert(input_level == ColumnFamilyData::kCompactAllLevels || + input_level >= 0); + + InternalKey begin_storage, end_storage; + CompactionArg* ca; + + bool scheduled = false; + bool manual_conflict = false; + ManualCompactionState manual; + manual.cfd = cfd; + manual.input_level = input_level; + manual.output_level = output_level; + manual.output_path_id = compact_range_options.target_path_id; + manual.done = false; + manual.in_progress = false; + manual.incomplete = false; + manual.exclusive = exclusive; + manual.disallow_trivial_move = disallow_trivial_move; + // For universal compaction, we enforce every manual compaction to compact + // all files. 
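+  // FIFO compaction is treated the same way below: any user-supplied range is
+  // ignored and the manual compaction covers the whole key space.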
+ if (begin == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.begin = nullptr; + } else { + begin_storage.SetMinPossibleForUserKey(*begin); + manual.begin = &begin_storage; + } + if (end == nullptr || + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + manual.end = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + manual.end = &end_storage; + } + + TEST_SYNC_POINT("DBImpl::RunManualCompaction:0"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); + InstrumentedMutexLock l(&mutex_); + + // When a manual compaction arrives, temporarily disable scheduling of + // non-manual compactions and wait until the number of scheduled compaction + // jobs drops to zero. This is needed to ensure that this manual compaction + // can compact any range of keys/files. + // + // HasPendingManualCompaction() is true when at least one thread is inside + // RunManualCompaction(), i.e. during that time no other compaction will + // get scheduled (see MaybeScheduleFlushOrCompaction). + // + // Note that the following loop doesn't stop more that one thread calling + // RunManualCompaction() from getting to the second while loop below. + // However, only one of them will actually schedule compaction, while + // others will wait on a condition variable until it completes. + + AddManualCompaction(&manual); + TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); + if (exclusive) { + while (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0) { + TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled"); + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Manual compaction waiting for all other scheduled background " + "compactions to finish", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] Manual compaction starting", cfd->GetName().c_str()); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + // We don't check bg_error_ here, because if we get the error in compaction, + // the compaction will set manual.status to bg_error_ and set manual.done to + // true. 
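+  // Loop until the manual compaction is marked done: either wait on bg_cv_
+  // while a conflicting compaction is in progress, or hand exactly one picked
+  // compaction over to the low-priority background thread pool.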
+ while (!manual.done) { + assert(HasPendingManualCompaction()); + manual_conflict = false; + Compaction* compaction = nullptr; + if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || + scheduled || + (((manual.manual_end = &manual.tmp_storage1) != nullptr) && + ((compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, + manual.output_level, compact_range_options, manual.begin, + manual.end, &manual.manual_end, &manual_conflict, + max_file_num_to_ignore)) == nullptr && + manual_conflict))) { + // exclusive manual compactions should not see a conflict during + // CompactRange + assert(!exclusive || !manual_conflict); + // Running either this or some other manual compaction + bg_cv_.Wait(); + if (scheduled && manual.incomplete == true) { + assert(!manual.in_progress); + scheduled = false; + manual.incomplete = false; + } + } else if (!scheduled) { + if (compaction == nullptr) { + manual.done = true; + bg_cv_.SignalAll(); + continue; + } + ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->manual_compaction_state = &manual; + ca->prepicked_compaction->compaction = compaction; + if (!RequestCompactionToken( + cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) { + // Don't throttle manual compaction, only count outstanding tasks. + assert(false); + } + manual.incomplete = false; + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCompactionCallback); + scheduled = true; + } + } + + log_buffer.FlushBufferToLog(); + assert(!manual.in_progress); + assert(HasPendingManualCompaction()); + RemoveManualCompaction(&manual); + bg_cv_.SignalAll(); + return manual.status; +} + +void DBImpl::GenerateFlushRequest(const autovector& cfds, + FlushRequest* req) { + assert(req != nullptr); + req->reserve(cfds.size()); + for (const auto cfd : cfds) { + if (nullptr == cfd) { + // cfd may be null, see DBImpl::ScheduleFlushes + continue; + } + uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(); + req->emplace_back(cfd, max_memtable_id); + } +} + +Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_options, + FlushReason flush_reason, bool writes_stopped) { + Status s; + uint64_t flush_memtable_id = 0; + if (!flush_options.allow_write_stall) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone"); + if (!s.ok() || !flush_needed) { + return s; + } + } + FlushRequest flush_req; + { + WriteContext context; + InstrumentedMutexLock guard_lock(&mutex_); + + WriteThread::Writer w; + WriteThread::Writer nonmem_w; + if (!writes_stopped) { + write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + } + WaitForPendingWrites(); + + if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { + s = SwitchMemtable(cfd, &context); + } + if (s.ok()) { + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + if (immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + 
!cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", + cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd_stats, flush_memtable_id); + } + } + } + } + + if (s.ok() && !flush_req.empty()) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->imm()->FlushRequested(); + } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->Ref(); + } + } + SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } + + if (!writes_stopped) { + write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + } + } + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); + if (s.ok() && flush_options.wait) { + autovector cfds; + autovector flush_memtable_ids; + for (auto& iter : flush_req) { + cfds.push_back(iter.first); + flush_memtable_ids.push_back(&(iter.second)); + } + s = WaitForFlushMemTables(cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery)); + InstrumentedMutexLock lock_guard(&mutex_); + for (auto* tmp_cfd : cfds) { + tmp_cfd->UnrefAndTryDelete(); + } + } + TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished"); + return s; +} + +// Flush all elements in 'column_family_datas' +// and atomically record the result to the MANIFEST. 
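+// Callers in this file either pass an explicit list of column families (see
+// DBImpl::Flush() above) or gather eligible ones first, for example
+// (mirroring DBImpl::CompactRange()):
+//   autovector<ColumnFamilyData*> cfds;
+//   SelectColumnFamiliesForAtomicFlush(&cfds);
+//   s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+//                            false /* writes_stopped */);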
+Status DBImpl::AtomicFlushMemTables( + const autovector& column_family_datas, + const FlushOptions& flush_options, FlushReason flush_reason, + bool writes_stopped) { + Status s; + if (!flush_options.allow_write_stall) { + int num_cfs_to_flush = 0; + for (auto cfd : column_family_datas) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + if (!s.ok()) { + return s; + } else if (flush_needed) { + ++num_cfs_to_flush; + } + } + if (0 == num_cfs_to_flush) { + return s; + } + } + FlushRequest flush_req; + autovector cfds; + { + WriteContext context; + InstrumentedMutexLock guard_lock(&mutex_); + + WriteThread::Writer w; + WriteThread::Writer nonmem_w; + if (!writes_stopped) { + write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + } + WaitForPendingWrites(); + + for (auto cfd : column_family_datas) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + cfds.emplace_back(cfd); + } + } + for (auto cfd : cfds) { + if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) { + continue; + } + cfd->Ref(); + s = SwitchMemtable(cfd, &context); + cfd->UnrefAndTryDelete(); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + AssignAtomicFlushSeq(cfds); + for (auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto cfd : cfds) { + cfd->Ref(); + } + } + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } + + if (!writes_stopped) { + write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + } + } + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); + if (s.ok() && flush_options.wait) { + autovector flush_memtable_ids; + for (auto& iter : flush_req) { + flush_memtable_ids.push_back(&(iter.second)); + } + s = WaitForFlushMemTables(cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery)); + InstrumentedMutexLock lock_guard(&mutex_); + for (auto* cfd : cfds) { + cfd->UnrefAndTryDelete(); + } + } + return s; +} + +// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can +// cause write stall, for example if one memtable is being flushed already. +// This method tries to avoid write stall (similar to CompactRange() behavior) +// it emulates how the SuperVersion / LSM would change if flush happens, checks +// it against various constrains and delays flush if it'd cause write stall. +// Called should check status and flush_needed to see if flush already happened. 
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed) { + { + *flush_needed = true; + InstrumentedMutexLock l(&mutex_); + uint64_t orig_active_memtable_id = cfd->mem()->GetID(); + WriteStallCondition write_stall_condition = WriteStallCondition::kNormal; + do { + if (write_stall_condition != WriteStallCondition::kNormal) { + // Same error handling as user writes: Don't wait if there's a + // background error, even if it's a soft error. We might wait here + // indefinitely as the pending flushes/compactions may never finish + // successfully, resulting in the stall condition lasting indefinitely + if (error_handler_.IsBGWorkStopped()) { + return error_handler_.GetBGError(); + } + + TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] WaitUntilFlushWouldNotStallWrites" + " waiting on stall conditions to clear", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + uint64_t earliest_memtable_id = + std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); + if (earliest_memtable_id > orig_active_memtable_id) { + // We waited so long that the memtable we were originally waiting on was + // flushed. + *flush_needed = false; + return Status::OK(); + } + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + const auto* vstorage = cfd->current()->storage_info(); + + // Skip stalling check if we're below auto-flush and auto-compaction + // triggers. If it stalled in these conditions, that'd mean the stall + // triggers are so low that stalling is needed for any background work. In + // that case we shouldn't wait since background work won't be scheduled. + if (cfd->imm()->NumNotFlushed() < + cfd->ioptions()->min_write_buffer_number_to_merge && + vstorage->l0_delay_trigger_count() < + mutable_cf_options.level0_file_num_compaction_trigger) { + break; + } + + // check whether one extra immutable memtable or an extra L0 file would + // cause write stalling mode to be entered. It could still enter stall + // mode due to pending compaction bytes, but that's less common + write_stall_condition = + ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) + .first; + } while (write_stall_condition != WriteStallCondition::kNormal); + } + return Status::OK(); +} + +// Wait for memtables to be flushed for multiple column families. +// let N = cfds.size() +// for i in [0, N), +// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs +// have to be flushed for THIS column family; +// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column +// family have to be flushed. +// Finish waiting when ALL column families finish flushing memtables. +// resuming_from_bg_err indicates whether the caller is trying to resume from +// background error or in normal processing. +Status DBImpl::WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids, + bool resuming_from_bg_err) { + int num = static_cast(cfds.size()); + // Wait until the compaction completes + InstrumentedMutexLock l(&mutex_); + // If the caller is trying to resume from bg error, then + // error_handler_.IsDBStopped() is true. 
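+  // The loop below exits when every column family in the request has either
+  // been dropped or finished flushing, when shutdown starts, when the
+  // recovery itself hits an error, or (outside of recovery) when a background
+  // error has stopped the DB.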
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + // If an error has occurred during resumption, then no need to wait. + if (!error_handler_.GetRecoveryError().ok()) { + break; + } + // Number of column families that have been dropped. + int num_dropped = 0; + // Number of column families that have finished flush. + int num_finished = 0; + for (int i = 0; i < num; ++i) { + if (cfds[i]->IsDropped()) { + ++num_dropped; + } else if (cfds[i]->imm()->NumNotFlushed() == 0 || + (flush_memtable_ids[i] != nullptr && + cfds[i]->imm()->GetEarliestMemTableID() > + *flush_memtable_ids[i])) { + ++num_finished; + } + } + if (1 == num_dropped && 1 == num) { + return Status::InvalidArgument("Cannot flush a dropped CF"); + } + // Column families involved in this flush request have either been dropped + // or finished flush. Then it's time to finish waiting. + if (num_dropped + num_finished == num) { + break; + } + bg_cv_.Wait(); + } + Status s; + // If not resuming from bg error, and an error has caused the DB to stop, + // then report the bg error to caller. + if (!resuming_from_bg_err && error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; +} + +Status DBImpl::EnableAutoCompaction( + const std::vector& column_family_handles) { + Status s; + for (auto cf_ptr : column_family_handles) { + Status status = + this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}); + if (!status.ok()) { + s = status; + } + } + + return s; +} + +void DBImpl::DisableManualCompaction() { + manual_compaction_paused_.store(true, std::memory_order_release); +} + +void DBImpl::EnableManualCompaction() { + manual_compaction_paused_.store(false, std::memory_order_release); +} + +void DBImpl::MaybeScheduleFlushOrCompaction() { + mutex_.AssertHeld(); + if (!opened_successfully_) { + // Compaction may introduce data race to DB open + return; + } + if (bg_work_paused_ > 0) { + // we paused the background work + return; + } else if (error_handler_.IsBGWorkStopped() && + !error_handler_.IsRecoveryInProgress()) { + // There has been a hard error and this call is not part of the recovery + // sequence. Bail out here so we don't get into an endless loop of + // scheduling BG work which will again call this function + return; + } else if (shutting_down_.load(std::memory_order_acquire)) { + // DB is being deleted; no more background compactions + return; + } + auto bg_job_limits = GetBGJobLimits(); + bool is_flush_pool_empty = + env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; + while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && + bg_flush_scheduled_ < bg_job_limits.max_flushes) { + bg_flush_scheduled_++; + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::HIGH; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this, + &DBImpl::UnscheduleFlushCallback); + --unscheduled_flushes_; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", + &unscheduled_flushes_); + } + + // special case -- if high-pri (flush) thread pool is empty, then schedule + // flushes in low-pri (compaction) thread pool. 
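+  // In that case flushes still count against max_flushes, but the limit is
+  // checked against the combined number of flushes and compactions already
+  // scheduled in the low-pri pool.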
+ if (is_flush_pool_empty) { + while (unscheduled_flushes_ > 0 && + bg_flush_scheduled_ + bg_compaction_scheduled_ < + bg_job_limits.max_flushes) { + bg_flush_scheduled_++; + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this, + &DBImpl::UnscheduleFlushCallback); + --unscheduled_flushes_; + } + } + + if (bg_compaction_paused_ > 0) { + // we paused the background compaction + return; + } else if (error_handler_.IsBGWorkStopped()) { + // Compaction is not part of the recovery sequence from a hard error. We + // might get here because recovery might do a flush and install a new + // super version, which will try to schedule pending compactions. Bail + // out here and let the higher level recovery handle compactions + return; + } + + if (HasExclusiveManualCompaction()) { + // only manual compactions are allowed to run. don't schedule automatic + // compactions + TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict"); + return; + } + + while (bg_compaction_scheduled_ < bg_job_limits.max_compactions && + unscheduled_compactions_ > 0) { + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = nullptr; + bg_compaction_scheduled_++; + unscheduled_compactions_--; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, + &DBImpl::UnscheduleCompactionCallback); + } +} + +DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const { + mutex_.AssertHeld(); + return GetBGJobLimits(immutable_db_options_.max_background_flushes, + mutable_db_options_.max_background_compactions, + mutable_db_options_.max_background_jobs, + write_controller_.NeedSpeedupCompaction()); +} + +DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, + int max_background_compactions, + int max_background_jobs, + bool parallelize_compactions) { + BGJobLimits res; + if (max_background_flushes == -1 && max_background_compactions == -1) { + // for our first stab implementing max_background_jobs, simply allocate a + // quarter of the threads to flushes. + res.max_flushes = std::max(1, max_background_jobs / 4); + res.max_compactions = std::max(1, max_background_jobs - res.max_flushes); + } else { + // compatibility code in case users haven't migrated to max_background_jobs, + // which automatically computes flush/compaction limits + res.max_flushes = std::max(1, max_background_flushes); + res.max_compactions = std::max(1, max_background_compactions); + } + if (!parallelize_compactions) { + // throttle background compactions until we deem necessary + res.max_compactions = 1; + } + return res; +} + +void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { + assert(!cfd->queued_for_compaction()); + cfd->Ref(); + compaction_queue_.push_back(cfd); + cfd->set_queued_for_compaction(true); +} + +ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { + assert(!compaction_queue_.empty()); + auto cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(cfd->queued_for_compaction()); + cfd->set_queued_for_compaction(false); + return cfd; +} + +DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { + assert(!flush_queue_.empty()); + FlushRequest flush_req = flush_queue_.front(); + flush_queue_.pop_front(); + // TODO: need to unset flush reason? 
+ return flush_req; +} + +ColumnFamilyData* DBImpl::PickCompactionFromQueue( + std::unique_ptr* token, LogBuffer* log_buffer) { + assert(!compaction_queue_.empty()); + assert(*token == nullptr); + autovector throttled_candidates; + ColumnFamilyData* cfd = nullptr; + while (!compaction_queue_.empty()) { + auto first_cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(first_cfd->queued_for_compaction()); + if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) { + throttled_candidates.push_back(first_cfd); + continue; + } + cfd = first_cfd; + cfd->set_queued_for_compaction(false); + break; + } + // Add throttled compaction candidates back to queue in the original order. + for (auto iter = throttled_candidates.rbegin(); + iter != throttled_candidates.rend(); ++iter) { + compaction_queue_.push_front(*iter); + } + return cfd; +} + +void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, + FlushReason flush_reason) { + if (flush_req.empty()) { + return; + } + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); +} + +void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } +} + +void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, + FileType type, uint64_t number, int job_id) { + mutex_.AssertHeld(); + PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); + purge_files_.insert({{number, std::move(file_info)}}); +} + +void DBImpl::BGWorkFlush(void* arg) { + FlushThreadArg fta = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + + IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); + TEST_SYNC_POINT("DBImpl::BGWorkFlush"); + static_cast_with_check(fta.db_)->BackgroundCallFlush( + fta.thread_pri_); + TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); +} + +void DBImpl::BGWorkCompaction(void* arg) { + CompactionArg ca = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); + TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); + auto prepicked_compaction = + static_cast(ca.prepicked_compaction); + static_cast_with_check(ca.db)->BackgroundCallCompaction( + prepicked_compaction, Env::Priority::LOW); + delete prepicked_compaction; +} + +void DBImpl::BGWorkBottomCompaction(void* arg) { + CompactionArg ca = *(static_cast(arg)); + delete static_cast(arg); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM); + TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction"); + auto* prepicked_compaction = ca.prepicked_compaction; + assert(prepicked_compaction && prepicked_compaction->compaction && + !prepicked_compaction->manual_compaction_state); + ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM); + delete prepicked_compaction; +} + +void DBImpl::BGWorkPurge(void* db) { + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:start"); + reinterpret_cast(db)->BackgroundCallPurge(); + TEST_SYNC_POINT("DBImpl::BGWorkPurge:end"); +} + +void DBImpl::UnscheduleCompactionCallback(void* arg) { + CompactionArg ca = *(reinterpret_cast(arg)); + delete reinterpret_cast(arg); + if (ca.prepicked_compaction != nullptr) { + if (ca.prepicked_compaction->compaction != nullptr) { + delete ca.prepicked_compaction->compaction; + } + delete ca.prepicked_compaction; + } + 
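PickCompactionFromQueue() above preserves the original ordering of throttled candidates by pushing them back to the front in reverse iteration order. A minimal standalone illustration of that queue discipline, with plain integers standing in for column families and IsAdmitted standing in for the compaction-token check:

#include <cassert>
#include <deque>
#include <vector>

int PickFromQueue(std::deque<int>& queue, bool (*IsAdmitted)(int)) {
  std::vector<int> throttled;
  int picked = -1;
  while (!queue.empty()) {
    int first = queue.front();
    queue.pop_front();
    if (!IsAdmitted(first)) {
      throttled.push_back(first);  // remember rejects in pop order
      continue;
    }
    picked = first;
    break;
  }
  // Re-insert rejects at the front, reversed, to restore their relative order.
  for (auto it = throttled.rbegin(); it != throttled.rend(); ++it) {
    queue.push_front(*it);
  }
  return picked;
}

int main() {
  std::deque<int> q = {1, 2, 3, 4};
  auto admit = [](int v) { return v == 3; };
  int got = PickFromQueue(q, admit);
  assert(got == 3);
  assert((q == std::deque<int>{1, 2, 4}));  // 1 and 2 keep their order
}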
TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback"); +} + +void DBImpl::UnscheduleFlushCallback(void* arg) { + delete reinterpret_cast(arg); + TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); +} + +Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri) { + mutex_.AssertHeld(); + + Status status; + *reason = FlushReason::kOthers; + // If BG work is stopped due to an error, but a recovery is in progress, + // that means this flush is part of the recovery. So allow it to go through + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } + } else if (!error_handler_.IsRecoveryInProgress()) { + status = error_handler_.GetBGError(); + } + + if (!status.ok()) { + return status; + } + + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; + autovector column_families_not_to_flush; + while (!flush_queue_.empty()) { + // This cfd is already referenced + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + superversion_contexts.clear(); + superversion_contexts.reserve(flush_req.size()); + + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + column_families_not_to_flush.push_back(cfd); + continue; + } + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back())); + } + if (!bg_flush_args.empty()) { + break; + } + } + + if (!bg_flush_args.empty()) { + auto bg_job_limits = GetBGJobLimits(); + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); + // All the CFDs in the FlushReq must have the same flush reason, so just + // grab the first one + *reason = bg_flush_args[0].cfd_->GetFlushReason(); + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->UnrefAndTryDelete()) { + arg.cfd_ = nullptr; + } + } + } + for (auto cfd : column_families_not_to_flush) { + cfd->UnrefAndTryDelete(); + } + return status; +} + +void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { + bool made_progress = false; + JobContext job_context(next_job_id_.fetch_add(1), true); + + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + { + InstrumentedMutexLock l(&mutex_); + assert(bg_flush_scheduled_); + num_running_flushes_++; + + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + FlushReason reason; + + Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, + &reason, thread_pri); + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && + reason != FlushReason::kErrorRecovery) { + // Wait a little 
bit before retrying background flush in + // case this is an environmental problem and we do not want to + // chew up resources for failed flushes for the duration of + // the problem. + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Waiting after background flush error: %s" + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + log_buffer.FlushBufferToLog(); + LogFlush(immutable_db_options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } + + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0"); + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // If flush failed, we want to delete all temporary files that we might have + // created. Thus, we force full scan in FindObsoleteFiles() + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); + // Have to flush the info logs before bg_flush_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); + } + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); + + assert(num_running_flushes_ > 0); + num_running_flushes_--; + bg_flush_scheduled_--; + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); + atomic_flush_install_cv_.SignalAll(); + bg_cv_.SignalAll(); + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. + } +} + +void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, + Env::Priority bg_thread_pri) { + bool made_progress = false; + JobContext job_context(next_job_id_.fetch_add(1), true); + TEST_SYNC_POINT("BackgroundCallCompaction:0"); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + { + InstrumentedMutexLock l(&mutex_); + + // This call will unlock/lock the mutex to wait for current running + // IngestExternalFile() calls to finish. 
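The flush error path above deliberately releases the DB mutex around logging and the one-second backoff so that waiters and writers can make progress. A minimal sketch of that unlock-around-slow-work pattern, with invented names and std::mutex standing in for the instrumented mutex:

#include <chrono>
#include <mutex>
#include <thread>

std::mutex mu;
int shared_error_count = 0;

void HandleBackgroundError() {
  std::unique_lock<std::mutex> lock(mu);
  ++shared_error_count;  // state change while the lock is held
  lock.unlock();         // release before the slow part
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // backoff/logging
  lock.lock();           // reacquire before touching shared state again
}

int main() {
  HandleBackgroundError();
  return shared_error_count == 1 ? 0 : 1;
}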
+ WaitForIngestFile(); + + num_running_compactions_++; + + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + assert((bg_thread_pri == Env::Priority::BOTTOM && + bg_bottom_compaction_scheduled_) || + (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_)); + Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer, + prepicked_compaction, bg_thread_pri); + TEST_SYNC_POINT("BackgroundCallCompaction:1"); + if (s.IsBusy()) { + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + env_->SleepForMicroseconds(10000); // prevent hot loop + mutex_.Lock(); + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { + // Wait a little bit before retrying background compaction in + // case this is an environmental problem and we do not want to + // chew up resources for failed compactions for the duration of + // the problem. + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Waiting after background compaction error: %s, " + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + LogFlush(immutable_db_options_.info_log); + env_->SleepForMicroseconds(1000000); + mutex_.Lock(); + } else if (s.IsManualCompactionPaused()) { + ManualCompactionState* m = prepicked_compaction->manual_compaction_state; + assert(m); + ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", + m->cfd->GetName().c_str(), job_context.job_id); + } + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // If compaction failed, we want to delete all temporary files that we might + // have created (they might not be all recorded in job_context in case of a + // failure). Thus, we force full scan in FindObsoleteFiles() + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && + !s.IsColumnFamilyDropped()); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. 
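The comments above explain why nothing that needs DB state may run after the scheduled-job counter is decremented and the condition variable is signaled: a thread waiting for quiescence (such as the destructor) may then proceed. A standalone model of that handshake, using an invented BackgroundTracker class:

#include <condition_variable>
#include <mutex>
#include <thread>

class BackgroundTracker {
 public:
  void StartJob() {
    std::lock_guard<std::mutex> lock(mu_);
    ++scheduled_;
  }
  void FinishJob() {
    std::lock_guard<std::mutex> lock(mu_);
    // Anything that needs member state (flushing logs, etc.) must happen
    // before the decrement: once scheduled_ hits zero and the signal is sent,
    // a thread blocked in WaitForIdle() is free to tear things down.
    --scheduled_;
    cv_.notify_all();
    // Do not touch other members after this point.
  }
  void WaitForIdle() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return scheduled_ == 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int scheduled_ = 0;
};

int main() {
  BackgroundTracker tracker;
  tracker.StartJob();
  std::thread worker([&tracker] { tracker.FinishJob(); });
  tracker.WaitForIdle();
  worker.join();
}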
+ log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles"); + } + job_context.Clean(); + mutex_.Lock(); + } + + assert(num_running_compactions_ > 0); + num_running_compactions_--; + if (bg_thread_pri == Env::Priority::LOW) { + bg_compaction_scheduled_--; + } else { + assert(bg_thread_pri == Env::Priority::BOTTOM); + bg_bottom_compaction_scheduled_--; + } + + versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); + if (made_progress || + (bg_compaction_scheduled_ == 0 && + bg_bottom_compaction_scheduled_ == 0) || + HasPendingManualCompaction() || unscheduled_compactions_ == 0) { + // signal if + // * made_progress -- need to wakeup DelayWrite + // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl + // * HasPendingManualCompaction -- need to wakeup RunManualCompaction + // If none of this is true, there is no need to signal since nobody is + // waiting for it + bg_cv_.SignalAll(); + } + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. + } +} + +Status DBImpl::BackgroundCompaction(bool* made_progress, + JobContext* job_context, + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri) { + ManualCompactionState* manual_compaction = + prepicked_compaction == nullptr + ? nullptr + : prepicked_compaction->manual_compaction_state; + *made_progress = false; + mutex_.AssertHeld(); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); + + bool is_manual = (manual_compaction != nullptr); + std::unique_ptr c; + if (prepicked_compaction != nullptr && + prepicked_compaction->compaction != nullptr) { + c.reset(prepicked_compaction->compaction); + } + bool is_prepicked = is_manual || c; + + // (manual_compaction->in_progress == false); + bool trivial_move_disallowed = + is_manual && manual_compaction->disallow_trivial_move; + + CompactionJobStats compaction_job_stats; + Status status; + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } else if (is_manual && + manual_compaction_paused_.load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + } else { + status = error_handler_.GetBGError(); + // If we get here, it means a hard error happened after this compaction + // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got + // a chance to execute. 
Since we didn't pop a cfd from the compaction + // queue, increment unscheduled_compactions_ + unscheduled_compactions_++; + } + + if (!status.ok()) { + if (is_manual) { + manual_compaction->status = status; + manual_compaction->done = true; + manual_compaction->in_progress = false; + manual_compaction = nullptr; + } + if (c) { + c->ReleaseCompactionFiles(status); + c.reset(); + } + return status; + } + + if (is_manual) { + // another thread cannot pick up the same work + manual_compaction->in_progress = true; + } + + std::unique_ptr task_token; + + // InternalKey manual_end_storage; + // InternalKey* manual_end = &manual_end_storage; + bool sfm_reserved_compact_space = false; + if (is_manual) { + ManualCompactionState* m = manual_compaction; + assert(m->in_progress); + if (!c) { + m->done = true; + m->manual_end = nullptr; + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Manual compaction from level-%d from %s .. " + "%s; nothing to do\n", + m->cfd->GetName().c_str(), m->input_level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)")); + } else { + // First check if we have enough room to do the compaction + bool enough_room = EnoughRoomForCompaction( + m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c.reset(); + // m's vars will get set properly at the end of this function, + // as long as status == CompactionTooLarge + status = Status::CompactionTooLarge(); + } else { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Manual compaction from level-%d to level-%d from %s .. " + "%s; will stop at %s\n", + m->cfd->GetName().c_str(), m->input_level, c->output_level(), + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || m->manual_end == nullptr) + ? "(end)" + : m->manual_end->DebugString().c_str())); + } + } + } else if (!is_prepicked && !compaction_queue_.empty()) { + if (HasExclusiveManualCompaction()) { + // Can't compact right now, but try again later + TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); + + // Stay in the compaction queue. + unscheduled_compactions_++; + + return Status::OK(); + } + + auto cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } + + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->UnrefAndTryDelete()) { + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. 
Need to make sure mutex is held + // until we make a copy in the following code + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); + c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + + if (c != nullptr) { + bool enough_room = EnoughRoomForCompaction( + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + } else { + // update statistics + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); + } + } + } + } + } + + if (!c) { + // Nothing to do + ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); + } else if (c->deletion_compaction()) { + // TODO(icanadi) Do we want to honor snapshots here? i.e. 
not delete old + // file if there is alive snapshot pointing to it + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + assert(c->num_input_files(1) == 0); + assert(c->level() == 0); + assert(c->column_family_data()->ioptions()->compaction_style == + kCompactionStyleFIFO); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + for (const auto& f : *c->inputs(0)) { + c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); + } + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", + c->column_family_data()->GetName().c_str(), + c->num_input_files(0)); + *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!trivial_move_disallowed && c->IsTrivialMove()) { + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + // Instrument for event update + // TODO(yhchiang): add op details for showing trivial-move. + ThreadStatusUtil::SetColumnFamily( + c->column_family_data(), c->column_family_data()->ioptions()->env, + immutable_db_options_.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); + + compaction_job_stats.num_input_files = c->num_input_files(0); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + // Move files to next level + int32_t moved_files = 0; + int64_t moved_bytes = 0; + for (unsigned int l = 0; l < c->num_input_levels(); l++) { + if (c->level(l) == c->output_level()) { + continue; + } + for (size_t i = 0; i < c->num_input_files(l); i++) { + FileMetaData* f = c->input(l, i); + c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); + c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), + f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, + f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->output_level(), f->fd.GetFileSize()); + ++moved_files; + moved_bytes += f->fd.GetFileSize(); + } + } + + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + // Use latest MutableCFOptions + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + + VersionStorageInfo::LevelSummaryStorage tmp; + c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), + moved_bytes); + { + event_logger_.LogToBuffer(log_buffer) + << "job" << job_context->job_id << "event" + << "trivial_move" + << "destination_level" << c->output_level() << "files" << moved_files + << "total_files_size" << moved_bytes; + } + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moved #%d files to 
level-%d %" PRIu64 " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), moved_files, + c->output_level(), moved_bytes, status.ToString().c_str(), + c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); + *made_progress = true; + + // Clear Instrument + ThreadStatusUtil::ResetThreadStatus(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!is_prepicked && c->output_level() > 0 && + c->output_level() == + c->column_family_data() + ->current() + ->storage_info() + ->MaxOutputLevel( + immutable_db_options_.allow_ingest_behind) && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + // Forward compactions involving last level to the bottom pool if it exists, + // such that compactions unlikely to contribute to write stalls can be + // delayed or deprioritized. + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->compaction = c.release(); + ca->prepicked_compaction->manual_compaction_state = nullptr; + // Transfer requested token, so it doesn't need to do it again. + ca->prepicked_compaction->task_token = std::move(task_token); + ++bg_bottom_compaction_scheduled_; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, + this, &DBImpl::UnscheduleCompactionCallback); + } else { + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + int output_level __attribute__((__unused__)); + output_level = c->output_level(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", + &output_level); + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + assert(is_snapshot_supported_ || snapshots_.empty()); + CompactionJob compaction_job( + job_context->job_id, c.get(), immutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), stats_, + &mutex_, &error_handler_, snapshot_seqs, + earliest_write_conflict_snapshot, snapshot_checker, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + c->mutable_cf_options()->report_bg_io_stats, dbname_, + &compaction_job_stats, thread_pri, + is_manual ? 
&manual_compaction_paused_ : nullptr); + compaction_job.Prepare(); + + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + + mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + compaction_job.Run(); + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); + mutex_.Lock(); + + status = compaction_job.Install(*c->mutable_cf_options()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); + } + *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } + if (c != nullptr) { + c->ReleaseCompactionFiles(status); + *made_progress = true; + +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE + + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + } + + if (status.ok() || status.IsCompactionTooLarge() || + status.IsManualCompactionPaused()) { + // Done + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", + status.ToString().c_str()); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { + // Put this cfd back in the compaction queue so we can retry after some + // time + auto cfd = c->column_family_data(); + assert(cfd != nullptr); + // Since this compaction failed, we need to recompute the score so it + // takes the original input files into account + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + if (!cfd->queued_for_compaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } + } + } + // this will unref its input_version and column_family_data + c.reset(); + + if (is_manual) { + ManualCompactionState* m = manual_compaction; + if (!status.ok()) { + m->status = status; + m->done = true; + } + // For universal compaction: + // Because universal compaction always happens at level 0, so one + // compaction will pick up all overlapped files. No files will be + // filtered out due to size limit and left for a successive compaction. + // So we can safely conclude the current compaction. + // + // Also note that, if we don't stop here, then the current compaction + // writes a new file back to level 0, which will be used in successive + // compaction. Hence the manual compaction will never finish. + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (m->manual_end == nullptr) { + m->done = true; + } + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. 
+ // Universal and FIFO compactions should always compact the whole range + assert(m->cfd->ioptions()->compaction_style != + kCompactionStyleUniversal || + m->cfd->ioptions()->num_levels > 1); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); + m->tmp_storage = *m->manual_end; + m->begin = &m->tmp_storage; + m->incomplete = true; + } + m->in_progress = false; // not being processed anymore + } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); + return status; +} + +bool DBImpl::HasPendingManualCompaction() { + return (!manual_compaction_dequeue_.empty()); +} + +void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { + manual_compaction_dequeue_.push_back(m); +} + +void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { + // Remove from queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + it = manual_compaction_dequeue_.erase(it); + return; + } + ++it; + } + assert(false); + return; +} + +bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { + if (num_running_ingest_file_ > 0) { + // We need to wait for other IngestExternalFile() calls to finish + // before running a manual compaction. + return true; + } + if (m->exclusive) { + return (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0); + } + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + bool seen = false; + while (it != manual_compaction_dequeue_.end()) { + if (m == (*it)) { + ++it; + seen = true; + continue; + } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { + // Consider the other manual compaction *it, conflicts if: + // overlaps with m + // and (*it) is ahead in the queue and is not yet in progress + return true; + } + ++it; + } + return false; +} + +bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) { + // Allow automatic compaction if manual compaction is + // in progress + return true; + } + ++it; + } + return false; +} + +bool DBImpl::HasExclusiveManualCompaction() { + // Remove from priority queue + std::deque::iterator it = + manual_compaction_dequeue_.begin(); + while (it != manual_compaction_dequeue_.end()) { + if ((*it)->exclusive) { + return true; + } + ++it; + } + return false; +} + +bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { + if ((m->exclusive) || (m1->exclusive)) { + return true; + } + if (m->cfd != m1->cfd) { + return false; + } + return true; +} + +#ifndef ROCKSDB_LITE +void DBImpl::BuildCompactionJobInfo( + const ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id, + const Version* current, CompactionJobInfo* compaction_job_info) const { + assert(compaction_job_info != nullptr); + compaction_job_info->cf_id = cfd->GetID(); + compaction_job_info->cf_name = cfd->GetName(); + compaction_job_info->status = st; + compaction_job_info->thread_id = env_->GetThreadID(); + compaction_job_info->job_id = job_id; + compaction_job_info->base_input_level = c->start_level(); + compaction_job_info->output_level = c->output_level(); + compaction_job_info->stats = compaction_job_stats; + compaction_job_info->table_properties = 
c->GetOutputTableProperties(); + compaction_job_info->compaction_reason = c->compaction_reason(); + compaction_job_info->compression = c->output_compression(); + for (size_t i = 0; i < c->num_input_levels(); ++i) { + for (const auto fmd : *c->inputs(i)) { + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + desc.GetPathId()); + compaction_job_info->input_files.push_back(fn); + compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); + if (compaction_job_info->table_properties.count(fn) == 0) { + std::shared_ptr tp; + auto s = current->GetTableProperties(&tp, fmd, &fn); + if (s.ok()) { + compaction_job_info->table_properties[fn] = tp; + } + } + } + } + for (const auto& newf : c->edit()->GetNewFiles()) { + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); + compaction_job_info->output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); + } +} +#endif + +// SuperVersionContext gets created and destructed outside of the lock -- +// we use this conveniently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete SuperVersion()s outside of the lock -- superversions_to_free +// +// However, if InstallSuperVersionAndScheduleWork() gets called twice with the +// same sv_context, we can't reuse the SuperVersion() that got +// malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. We do similar thing +// for superversion_to_free + +void DBImpl::InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options) { + mutex_.AssertHeld(); + + // Update max_total_in_memory_state_ + size_t old_memtable_size = 0; + auto* old_sv = cfd->GetSuperVersion(); + if (old_sv) { + old_memtable_size = old_sv->mutable_cf_options.write_buffer_size * + old_sv->mutable_cf_options.max_write_buffer_number; + } + + // this branch is unlikely to step in + if (UNLIKELY(sv_context->new_superversion == nullptr)) { + sv_context->NewSuperVersion(); + } + cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); + + // There may be a small data race here. The snapshot tricking bottommost + // compaction may already be released here. But assuming there will always be + // newer snapshot created and released frequently, the compaction will be + // triggered soon anyway. + bottommost_files_mark_threshold_ = kMaxSequenceNumber; + for (auto* my_cfd : *versions_->GetColumnFamilySet()) { + bottommost_files_mark_threshold_ = std::min( + bottommost_files_mark_threshold_, + my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + + // Whenever we install new SuperVersion, we might need to issue new flushes or + // compactions. 
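The pattern described in the comment block above, allocate outside the lock, swap under the lock, free outside the lock, can be sketched independently of RocksDB as follows. State, InstallState, and the globals are invented names for illustration only.

#include <memory>
#include <mutex>

struct State {
  int version = 0;
};

std::mutex mu;
State* current_state = nullptr;

// Swaps in `fresh` while holding the lock and returns the old value so the
// caller can destroy it after the lock is released.
std::unique_ptr<State> InstallState(std::unique_ptr<State> fresh) {
  std::lock_guard<std::mutex> lock(mu);
  State* old = current_state;
  current_state = fresh.release();
  return std::unique_ptr<State>(old);
}

int main() {
  auto first = std::make_unique<State>();     // allocated outside the lock
  auto retired = InstallState(std::move(first));
  auto second = std::make_unique<State>();
  retired = InstallState(std::move(second));  // `retired` now owns the old
                                              // State and frees it outside
                                              // the lock when reassigned or
                                              // destroyed
  delete current_state;
  return 0;
}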
+ SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + + // Update max_total_in_memory_state_ + max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size + + mutable_cf_options.write_buffer_size * + mutable_cf_options.max_write_buffer_number; +} + +// ShouldPurge is called by FindObsoleteFiles when doing a full scan, +// and db mutex (mutex_) should already be held. +// Actually, the current implementation of FindObsoleteFiles with +// full_scan=true can issue I/O requests to obtain list of files in +// directories, e.g. env_->getChildren while holding db mutex. +bool DBImpl::ShouldPurge(uint64_t file_number) const { + return files_grabbed_for_purge_.find(file_number) == + files_grabbed_for_purge_.end() && + purge_files_.find(file_number) == purge_files_.end(); +} + +// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex +// (mutex_) should already be held. +void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) { + files_grabbed_for_purge_.insert(file_number); +} + +void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { + InstrumentedMutexLock l(&mutex_); + // snapshot_checker_ should only set once. If we need to set it multiple + // times, we need to make sure the old one is not deleted while it is still + // using by a compaction job. + assert(!snapshot_checker_); + snapshot_checker_.reset(snapshot_checker); +} + +void DBImpl::GetSnapshotContext( + JobContext* job_context, std::vector* snapshot_seqs, + SequenceNumber* earliest_write_conflict_snapshot, + SnapshotChecker** snapshot_checker_ptr) { + mutex_.AssertHeld(); + assert(job_context != nullptr); + assert(snapshot_seqs != nullptr); + assert(earliest_write_conflict_snapshot != nullptr); + assert(snapshot_checker_ptr != nullptr); + + *snapshot_checker_ptr = snapshot_checker_.get(); + if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) { + *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance(); + } + if (*snapshot_checker_ptr != nullptr) { + // If snapshot_checker is used, that means the flush/compaction may + // contain values not visible to snapshot taken after + // flush/compaction job starts. Take a snapshot and it will appear + // in snapshot_seqs and force compaction iterator to consider such + // snapshots. + const Snapshot* job_snapshot = + GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/); + job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot)); + } + *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc new file mode 100644 index 000000000..610b57d39 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_debug.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef NDEBUG + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/error_handler.h" +#include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + InstrumentedMutexLock l(&mutex_); + return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); +} + +void DBImpl::TEST_SwitchWAL() { + WriteContext write_context; + InstrumentedMutexLock l(&mutex_); + void* writer = TEST_BeginWrite(); + SwitchWAL(&write_context); + TEST_EndWrite(writer); +} + +bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { + if (lock) { + log_write_mutex_.Lock(); + } + log::Writer* cur_log_writer = logs_.back().writer; + auto res = cur_log_writer->TEST_BufferIsEmpty(); + if (lock) { + log_write_mutex_.Unlock(); + } + return res; +} + +int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + InstrumentedMutexLock l(&mutex_); + return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes(); +} + +void DBImpl::TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, + std::vector>* metadata) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + InstrumentedMutexLock l(&mutex_); + metadata->resize(NumberLevels()); + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = + cfd->current()->storage_info()->LevelFiles(level); + + (*metadata)[level].clear(); + for (const auto& f : files) { + (*metadata)[level].push_back(*f); + } + } +} + +uint64_t DBImpl::TEST_Current_Manifest_FileNo() { + return versions_->manifest_file_number(); +} + +uint64_t DBImpl::TEST_Current_Next_FileNo() { + return versions_->current_next_file_number(); +} + +Status DBImpl::TEST_CompactRange(int level, const Slice* begin, + const Slice* end, + ColumnFamilyHandle* column_family, + bool disallow_trivial_move) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + int output_level = + (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) + ? 
level + : level + 1; + return RunManualCompaction(cfd, level, output_level, CompactRangeOptions(), + begin, end, true, disallow_trivial_move, + port::kMaxUint64 /*max_file_num_to_ignore*/); +} + +Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) { + WriteContext write_context; + InstrumentedMutexLock l(&mutex_); + if (cfd == nullptr) { + cfd = default_cf_handle_->cfd(); + } + + Status s; + void* writer = TEST_BeginWrite(); + if (two_write_queues_) { + WriteThread::Writer nonmem_w; + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + s = SwitchMemtable(cfd, &write_context); + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } else { + s = SwitchMemtable(cfd, &write_context); + } + TEST_EndWrite(writer); + return s; +} + +Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, + ColumnFamilyHandle* cfh) { + FlushOptions fo; + fo.wait = wait; + fo.allow_write_stall = allow_write_stall; + ColumnFamilyData* cfd; + if (cfh == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfhi = reinterpret_cast(cfh); + cfd = cfhi->cfd(); + } + return FlushMemTable(cfd, fo, FlushReason::kTest); +} + +Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts) { + return FlushMemTable(cfd, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_AtomicFlushMemTables( + const autovector& cfds, const FlushOptions& flush_opts) { + return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { + ColumnFamilyData* cfd; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = reinterpret_cast(column_family); + cfd = cfh->cfd(); + } + return WaitForFlushMemTable(cfd, nullptr, false); +} + +Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { + // Wait until the compaction completes + + // TODO: a bug here. This function actually does not necessarily + // wait for compact. It actually waits for scheduled compaction + // OR flush to finish. 
+ + InstrumentedMutexLock l(&mutex_); + while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_ || + (wait_unscheduled && unscheduled_compactions_)) && + (error_handler_.GetBGError() == Status::OK())) { + bg_cv_.Wait(); + } + return error_handler_.GetBGError(); +} + +void DBImpl::TEST_LockMutex() { mutex_.Lock(); } + +void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } + +void* DBImpl::TEST_BeginWrite() { + auto w = new WriteThread::Writer(); + write_thread_.EnterUnbatched(w, &mutex_); + return reinterpret_cast(w); +} + +void DBImpl::TEST_EndWrite(void* w) { + auto writer = reinterpret_cast(w); + write_thread_.ExitUnbatched(writer); + delete writer; +} + +size_t DBImpl::TEST_LogsToFreeSize() { + InstrumentedMutexLock l(&mutex_); + return logs_to_free_.size(); +} + +uint64_t DBImpl::TEST_LogfileNumber() { + InstrumentedMutexLock l(&mutex_); + return logfile_number_; +} + +Status DBImpl::TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map) { + std::vector cf_names; + std::vector iopts; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_names.push_back(cfd->GetName()); + iopts.push_back(cfd->ioptions()); + } + } + iopts_map->clear(); + for (size_t i = 0; i < cf_names.size(); ++i) { + iopts_map->insert({cf_names[i], iopts[i]}); + } + + return Status::OK(); +} + +uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { + return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep(); +} + +size_t DBImpl::TEST_PreparedSectionCompletedSize() { + return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize(); +} + +size_t DBImpl::TEST_LogsWithPrepSize() { + return logs_with_prep_tracker_.TEST_LogsWithPrepSize(); +} + +uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { + autovector empty_list; + return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr, + empty_list); +} + +Status DBImpl::TEST_GetLatestMutableCFOptions( + ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) { + InstrumentedMutexLock l(&mutex_); + + auto cfh = reinterpret_cast(column_family); + *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions(); + return Status::OK(); +} + +int DBImpl::TEST_BGCompactionsAllowed() const { + InstrumentedMutexLock l(&mutex_); + return GetBGJobLimits().max_compactions; +} + +int DBImpl::TEST_BGFlushesAllowed() const { + InstrumentedMutexLock l(&mutex_); + return GetBGJobLimits().max_flushes; +} + +SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastAllocatedSequence(); + } +} + +size_t DBImpl::TEST_GetWalPreallocateBlockSize( + uint64_t write_buffer_size) const { + InstrumentedMutexLock l(&mutex_); + return GetWalPreallocateBlockSize(write_buffer_size); +} + +void DBImpl::TEST_WaitForDumpStatsRun(std::function callback) const { + if (thread_dump_stats_ != nullptr) { + thread_dump_stats_->TEST_WaitForRun(callback); + } +} + +void DBImpl::TEST_WaitForPersistStatsRun(std::function callback) const { + if (thread_persist_stats_ != nullptr) { + thread_persist_stats_->TEST_WaitForRun(callback); + } +} + +bool DBImpl::TEST_IsPersistentStatsEnabled() const { + return thread_persist_stats_ && thread_persist_stats_->IsRunning(); +} + +size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + return EstimateInMemoryStatsHistorySize(); +} +} // namespace ROCKSDB_NAMESPACE +#endif // NDEBUG diff --git 
a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc new file mode 100644 index 000000000..f0c17ce95 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl/db_impl.h" + +#include +#include + +#include "db/column_family.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + InternalKey start_key, end_key; + if (begin != nullptr) { + start_key.SetMinPossibleForUserKey(*begin); + } + if (end != nullptr) { + end_key.SetMaxPossibleForUserKey(*end); + } + { + InstrumentedMutexLock l(&mutex_); + auto vstorage = cfd->current()->storage_info(); + for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) { + std::vector inputs; + vstorage->GetOverlappingInputs( + level, begin == nullptr ? nullptr : &start_key, + end == nullptr ? nullptr : &end_key, &inputs); + for (auto f : inputs) { + f->marked_for_compaction = true; + } + } + // Since we have some more files to compact, we should also recompute + // compaction score + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); +} + +Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { + assert(column_family); + + if (target_level < 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Invalid target level %d\n", target_level); + return Status::InvalidArgument("Invalid target level"); + } + + Status status; + VersionEdit edit; + JobContext job_context(next_job_id_.fetch_add(1), true); + { + InstrumentedMutexLock l(&mutex_); + auto* cfd = static_cast(column_family)->cfd(); + const auto* vstorage = cfd->current()->storage_info(); + + if (target_level >= vstorage->num_levels()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Target level %d does not exist\n", + target_level); + job_context.Clean(); + return Status::InvalidArgument("Target level does not exist"); + } + + // Sort L0 files by range. + const InternalKeyComparator* icmp = &cfd->internal_comparator(); + auto l0_files = vstorage->LevelFiles(0); + std::sort(l0_files.begin(), l0_files.end(), + [icmp](FileMetaData* f1, FileMetaData* f2) { + return icmp->Compare(f1->largest, f2->largest) < 0; + }); + + // Check that no L0 file is being compacted and that they have + // non-overlapping ranges. + for (size_t i = 0; i < l0_files.size(); ++i) { + auto f = l0_files[i]; + if (f->being_compacted) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. 
File %" PRIu64 " being compacted\n", + f->fd.GetNumber()); + job_context.Clean(); + return Status::InvalidArgument("PromoteL0 called during L0 compaction"); + } + + if (i == 0) continue; + auto prev_f = l0_files[i - 1]; + if (icmp->Compare(prev_f->largest, f->smallest) >= 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64 + " have overlapping ranges\n", + prev_f->fd.GetNumber(), f->fd.GetNumber()); + job_context.Clean(); + return Status::InvalidArgument("L0 has overlapping files"); + } + } + + // Check that all levels up to target_level are empty. + for (int level = 1; level <= target_level; ++level) { + if (vstorage->NumLevelFiles(level) > 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "PromoteL0 FAILED. Level %d not empty\n", level); + job_context.Clean(); + return Status::InvalidArgument( + "All levels up to target_level " + "must be empty"); + } + } + + edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : l0_files) { + edit.DeleteFile(0, f->fd.GetNumber()); + edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, + f->file_checksum, f->file_checksum_func_name); + } + + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); + } + } // lock released here + LogFlush(immutable_db_options_.info_log); + job_context.Clean(); + + return status; +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc new file mode 100644 index 000000000..c5d07dd01 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_files.cc @@ -0,0 +1,667 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_impl/db_impl.h" + +#include +#include +#include +#include "db/event_helpers.h" +#include "db/memtable_list.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t DBImpl::MinLogNumberToKeep() { + if (allow_2pc()) { + return versions_->min_log_number_to_keep_2pc(); + } else { + return versions_->MinLogNumberWithUnflushedData(); + } +} + +uint64_t DBImpl::MinObsoleteSstNumberToKeep() { + mutex_.AssertHeld(); + if (!pending_outputs_.empty()) { + return *pending_outputs_.begin(); + } + return std::numeric_limits::max(); +} + +// * Returns the list of live files in 'sst_live' +// If it's doing full scan: +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. 
+// no_full_scan = true -- never do the full scan using GetChildren() +// force = false -- don't force the full scan, except every +// mutable_db_options_.delete_obsolete_files_period_micros +// force = true -- force the full scan +void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, + bool no_full_scan) { + mutex_.AssertHeld(); + + // if deletion is disabled, do nothing + if (disable_delete_obsolete_files_ > 0) { + return; + } + + bool doing_the_full_scan = false; + + // logic for figuring out if we're doing the full scan + if (no_full_scan) { + doing_the_full_scan = false; + } else if (force || + mutable_db_options_.delete_obsolete_files_period_micros == 0) { + doing_the_full_scan = true; + } else { + const uint64_t now_micros = env_->NowMicros(); + if ((delete_obsolete_files_last_run_ + + mutable_db_options_.delete_obsolete_files_period_micros) < + now_micros) { + doing_the_full_scan = true; + delete_obsolete_files_last_run_ = now_micros; + } + } + + // don't delete files that might be currently written to from compaction + // threads + // Since job_context->min_pending_output is set, until file scan finishes, + // mutex_ cannot be released. Otherwise, we might see no min_pending_output + // here but later find newer generated unfinalized files while scanning. + if (!pending_outputs_.empty()) { + job_context->min_pending_output = *pending_outputs_.begin(); + } else { + // delete all of them + job_context->min_pending_output = std::numeric_limits::max(); + } + + // Get obsolete files. This function will also update the list of + // pending files in VersionSet(). + versions_->GetObsoleteFiles(&job_context->sst_delete_files, + &job_context->manifest_delete_files, + job_context->min_pending_output); + + // Mark the elements in job_context->sst_delete_files as grabbedForPurge + // so that other threads calling FindObsoleteFiles with full_scan=true + // will not add these files to candidate list for purge. + for (const auto& sst_to_del : job_context->sst_delete_files) { + MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber()); + } + + // store the current filenum, lognum, etc + job_context->manifest_file_number = versions_->manifest_file_number(); + job_context->pending_manifest_file_number = + versions_->pending_manifest_file_number(); + job_context->log_number = MinLogNumberToKeep(); + job_context->prev_log_number = versions_->prev_log_number(); + + versions_->AddLiveFiles(&job_context->sst_live); + if (doing_the_full_scan) { + InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), + dbname_); + std::set paths; + for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size(); + path_id++) { + paths.insert(immutable_db_options_.db_paths[path_id].path); + } + + // Note that if cf_paths is not specified in the ColumnFamilyOptions + // of a particular column family, we use db_paths as the cf_paths + // setting. Hence, there can be multiple duplicates of files from db_paths + // in the following code. The duplicate are removed while identifying + // unique files in PurgeObsoleteFiles. + for (auto cfd : *versions_->GetColumnFamilySet()) { + for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size(); + path_id++) { + auto& path = cfd->ioptions()->cf_paths[path_id].path; + + if (paths.find(path) == paths.end()) { + paths.insert(path); + } + } + } + + for (auto& path : paths) { + // set of all files in the directory. We'll exclude files that are still + // alive in the subsequent processings. 
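The full-scan decision near the top of FindObsoleteFiles() reduces to a small predicate. As a standalone restatement (ShouldDoFullScan is an invented name; the 6-hour figure below is only an example period):

#include <cassert>
#include <cstdint>

bool ShouldDoFullScan(bool no_full_scan, bool force, uint64_t period_micros,
                      uint64_t last_run_micros, uint64_t now_micros) {
  if (no_full_scan) {
    return false;  // caller forbids scanning the filesystem
  }
  if (force || period_micros == 0) {
    return true;   // forced, or periodic scanning disabled means always scan
  }
  return last_run_micros + period_micros < now_micros;  // period elapsed
}

int main() {
  const uint64_t kHour = 3600ULL * 1000 * 1000;  // microseconds
  // Example: 6-hour period, last run 7 hours ago: time for another full scan.
  assert(ShouldDoFullScan(false, false, 6 * kHour, /*last_run=*/0, /*now=*/7 * kHour));
  assert(!ShouldDoFullScan(true, true, 0, 0, 0));  // no_full_scan always wins
}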
+ std::vector files; + env_->GetChildren(path, &files); // Ignore errors + for (const std::string& file : files) { + uint64_t number; + FileType type; + // 1. If we cannot parse the file name, we skip; + // 2. If the file with file_number equals number has already been + // grabbed for purge by another compaction job, or it has already been + // schedule for purge, we also skip it if we + // are doing full scan in order to avoid double deletion of the same + // file under race conditions. See + // https://github.com/facebook/rocksdb/issues/3573 + if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) || + !ShouldPurge(number)) { + continue; + } + + // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + job_context->full_scan_candidate_files.emplace_back("/" + file, path); + } + } + + // Add log files in wal_dir + if (immutable_db_options_.wal_dir != dbname_) { + std::vector log_files; + env_->GetChildren(immutable_db_options_.wal_dir, + &log_files); // Ignore errors + for (const std::string& log_file : log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.wal_dir); + } + } + // Add info log files in db_log_dir + if (!immutable_db_options_.db_log_dir.empty() && + immutable_db_options_.db_log_dir != dbname_) { + std::vector info_log_files; + // Ignore errors + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + for (std::string& log_file : info_log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.db_log_dir); + } + } + } + + // logs_ is empty when called during recovery, in which case there can't yet + // be any tracked obsolete logs + if (!alive_log_files_.empty() && !logs_.empty()) { + uint64_t min_log_number = job_context->log_number; + size_t num_alive_log_files = alive_log_files_.size(); + // find newly obsoleted log files + while (alive_log_files_.begin()->number < min_log_number) { + auto& earliest = *alive_log_files_.begin(); + if (immutable_db_options_.recycle_log_file_num > + log_recycle_files_.size()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "adding log %" PRIu64 " to recycle list\n", + earliest.number); + log_recycle_files_.push_back(earliest.number); + } else { + job_context->log_delete_files.push_back(earliest.number); + } + if (job_context->size_log_to_delete == 0) { + job_context->prev_total_log_size = total_log_size_; + job_context->num_alive_log_files = num_alive_log_files; + } + job_context->size_log_to_delete += earliest.size; + total_log_size_ -= earliest.size; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + alive_log_files_.pop_front(); + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + // Current log should always stay alive since it can't have + // number < MinLogNumber(). + assert(alive_log_files_.size()); + } + while (!logs_.empty() && logs_.front().number < min_log_number) { + auto& log = logs_.front(); + if (log.getting_synced) { + log_sync_cv_.Wait(); + // logs_ could have changed while we were waiting. + continue; + } + logs_to_free_.push_back(log.ReleaseWriter()); + { + InstrumentedMutexLock wl(&log_write_mutex_); + logs_.pop_front(); + } + } + // Current log cannot be obsolete. + assert(!logs_.empty()); + } + + // We're just cleaning up for DB::Write(). 
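The WAL-retirement loop above either queues an obsolete log for recycling (until recycle_log_file_num slots are filled) or marks it for deletion, while keeping the running total of live WAL bytes up to date. The same bookkeeping, reduced to a standalone sketch with hypothetical types:

#include <cstddef>
#include <cstdint>
#include <deque>
#include <vector>

struct LogFileInfo {
  uint64_t number;
  uint64_t size;
};

// Retire every alive WAL older than 'min_log_number'. In the real code the
// current log can never be retired, which is why the loop there needs no
// emptiness check; the check is kept here only to stay self-contained.
void RetireObsoleteLogs(std::deque<LogFileInfo>* alive_logs,
                        uint64_t min_log_number, size_t recycle_capacity,
                        std::vector<uint64_t>* recycle,
                        std::vector<uint64_t>* to_delete,
                        uint64_t* total_log_size) {
  while (!alive_logs->empty() && alive_logs->front().number < min_log_number) {
    const LogFileInfo earliest = alive_logs->front();
    if (recycle->size() < recycle_capacity) {
      recycle->push_back(earliest.number);    // reuse the file for a future WAL
    } else {
      to_delete->push_back(earliest.number);  // delete it outright
    }
    *total_log_size -= earliest.size;
    alive_logs->pop_front();
  }
}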
+ assert(job_context->logs_to_free.empty()); + job_context->logs_to_free = logs_to_free_; + job_context->log_recycle_files.assign(log_recycle_files_.begin(), + log_recycle_files_.end()); + if (job_context->HaveSomethingToDelete()) { + ++pending_purge_obsolete_files_; + } + logs_to_free_.clear(); +} + +namespace { +bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, + const JobContext::CandidateFileInfo& second) { + if (first.file_name > second.file_name) { + return true; + } else if (first.file_name < second.file_name) { + return false; + } else { + return (first.file_path > second.file_path); + } +} +}; // namespace + +// Delete obsolete files and log status and information of file deletion +void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, + FileType type, uint64_t number) { + Status file_deletion_status; + if (type == kTableFile || type == kLogFile) { + file_deletion_status = + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); + } else { + file_deletion_status = env_->DeleteFile(fname); + } + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion", + &file_deletion_status); + if (file_deletion_status.ok()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, + fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else if (env_->FileExists(fname).IsNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 + " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", + job_id, fname.c_str(), type, number, + file_deletion_status.ToString().c_str()); + } + if (type == kTableFile) { + EventHelpers::LogAndNotifyTableFileDeletion( + &event_logger_, job_id, number, fname, file_deletion_status, GetName(), + immutable_db_options_.listeners); + } +} + +// Diffs the files listed in filenames and those that do not +// belong to live files are possibly removed. Also, removes all the +// files in sst_delete_files and log_delete_files. +// It is not necessary to hold the mutex when invoking this method. +void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin"); + // we'd better have sth to delete + assert(state.HaveSomethingToDelete()); + + // FindObsoleteFiles() should've populated this so nonzero + assert(state.manifest_file_number != 0); + + // Now, convert live list to an unordered map, WITHOUT mutex held; + // set is slow. + std::unordered_map sst_live_map; + for (const FileDescriptor& fd : state.sst_live) { + sst_live_map[fd.GetNumber()] = &fd; + } + std::unordered_set log_recycle_files_set( + state.log_recycle_files.begin(), state.log_recycle_files.end()); + + auto candidate_files = state.full_scan_candidate_files; + candidate_files.reserve( + candidate_files.size() + state.sst_delete_files.size() + + state.log_delete_files.size() + state.manifest_delete_files.size()); + // We may ignore the dbname when generating the file names. 
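As the comment above notes, the live-file list is re-indexed into an unordered_map keyed by file number, so that each candidate checked later costs one hash lookup instead of a tree-set search. A sketch of that conversion with a stand-in descriptor type (the returned map borrows pointers, so the input vector must outlive it):

#include <cstdint>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for the per-file descriptor kept in sst_live.
struct LiveFileDesc {
  uint64_t number;
  uint64_t size;
};

std::unordered_map<uint64_t, const LiveFileDesc*> BuildLiveMap(
    const std::vector<LiveFileDesc>& sst_live) {
  std::unordered_map<uint64_t, const LiveFileDesc*> live_map;
  live_map.reserve(sst_live.size());
  for (const LiveFileDesc& fd : sst_live) {
    live_map[fd.number] = &fd;  // O(1) membership test per candidate later
  }
  return live_map;
}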
+ for (auto& file : state.sst_delete_files) { + candidate_files.emplace_back( + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); + if (file.metadata->table_reader_handle) { + table_cache_->Release(file.metadata->table_reader_handle); + } + file.DeleteMetadata(); + } + + for (auto file_num : state.log_delete_files) { + if (file_num > 0) { + candidate_files.emplace_back(LogFileName(file_num), + immutable_db_options_.wal_dir); + } + } + for (const auto& filename : state.manifest_delete_files) { + candidate_files.emplace_back(filename, dbname_); + } + + // dedup state.candidate_files so we don't try to delete the same + // file twice + std::sort(candidate_files.begin(), candidate_files.end(), + CompareCandidateFile); + candidate_files.erase( + std::unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); + + if (state.prev_total_log_size > 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Try to delete WAL files size %" PRIu64 + ", prev total WAL file size %" PRIu64 + ", number of live WAL files %" ROCKSDB_PRIszt ".\n", + state.job_id, state.size_log_to_delete, + state.prev_total_log_size, state.num_alive_log_files); + } + + std::vector old_info_log_files; + InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), + dbname_); + + // File numbers of most recent two OPTIONS file in candidate_files (found in + // previos FindObsoleteFiles(full_scan=true)) + // At this point, there must not be any duplicate file numbers in + // candidate_files. + uint64_t optsfile_num1 = std::numeric_limits::min(); + uint64_t optsfile_num2 = std::numeric_limits::min(); + for (const auto& candidate_file : candidate_files) { + const std::string& fname = candidate_file.file_name; + uint64_t number; + FileType type; + if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) || + type != kOptionsFile) { + continue; + } + if (number > optsfile_num1) { + optsfile_num2 = optsfile_num1; + optsfile_num1 = number; + } else if (number > optsfile_num2) { + optsfile_num2 = number; + } + } + + // Close WALs before trying to delete them. + for (const auto w : state.logs_to_free) { + // TODO: maybe check the return value of Close. + w->Close(); + } + + bool own_files = OwnTablesAndLogs(); + std::unordered_set files_to_del; + for (const auto& candidate_file : candidate_files) { + const std::string& to_delete = candidate_file.file_name; + uint64_t number; + FileType type; + // Ignore file if we cannot recognize it. + if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) { + continue; + } + + bool keep = true; + switch (type) { + case kLogFile: + keep = ((number >= state.log_number) || + (number == state.prev_log_number) || + (log_recycle_files_set.find(number) != + log_recycle_files_set.end())); + break; + case kDescriptorFile: + // Keep my manifest file, and any newer incarnations' + // (can happen during manifest roll) + keep = (number >= state.manifest_file_number); + break; + case kTableFile: + // If the second condition is not there, this makes + // DontDeletePendingOutputs fail + keep = (sst_live_map.find(number) != sst_live_map.end()) || + number >= state.min_pending_output; + if (!keep) { + files_to_del.insert(number); + } + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live". + // Also, SetCurrentFile creates a temp file when writing out new + // manifest, which is equal to state.pending_manifest_file_number. 
We + // should not delete that file + // + // TODO(yhchiang): carefully modify the third condition to safely + // remove the temp options files. + keep = (sst_live_map.find(number) != sst_live_map.end()) || + (number == state.pending_manifest_file_number) || + (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); + break; + case kInfoLogFile: + keep = true; + if (number != 0) { + old_info_log_files.push_back(to_delete); + } + break; + case kOptionsFile: + keep = (number >= optsfile_num2); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", + reinterpret_cast(&number)); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", + reinterpret_cast(&keep)); + break; + case kCurrentFile: + case kDBLockFile: + case kIdentityFile: + case kMetaDatabase: + case kBlobFile: + keep = true; + break; + } + + if (keep) { + continue; + } + + std::string fname; + std::string dir_to_sync; + if (type == kTableFile) { + // evict from cache + TableCache::Evict(table_cache_.get(), number); + fname = MakeTableFileName(candidate_file.file_path, number); + dir_to_sync = candidate_file.file_path; + } else { + dir_to_sync = + (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + fname = dir_to_sync + + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || + (!to_delete.empty() && to_delete.front() == '/') + ? "" + : "/") + + to_delete; + } + +#ifndef ROCKSDB_LITE + if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || + immutable_db_options_.wal_size_limit_mb > 0)) { + wal_manager_.ArchiveWALFile(fname, number); + continue; + } +#endif // !ROCKSDB_LITE + + // If I do not own these files, e.g. secondary instance with max_open_files + // = -1, then no need to delete or schedule delete these files since they + // will be removed by their owner, e.g. the primary instance. + if (!own_files) { + continue; + } + Status file_deletion_status; + if (schedule_only) { + InstrumentedMutexLock guard_lock(&mutex_); + SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id); + } else { + DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number); + } + } + + { + // After purging obsolete files, remove them from files_grabbed_for_purge_. + InstrumentedMutexLock guard_lock(&mutex_); + autovector to_be_removed; + for (auto fn : files_grabbed_for_purge_) { + if (files_to_del.count(fn) != 0) { + to_be_removed.emplace_back(fn); + } + } + for (auto fn : to_be_removed) { + files_grabbed_for_purge_.erase(fn); + } + } + + // Delete old info log files. + size_t old_info_log_file_count = old_info_log_files.size(); + if (old_info_log_file_count != 0 && + old_info_log_file_count >= immutable_db_options_.keep_log_file_num) { + std::sort(old_info_log_files.begin(), old_info_log_files.end()); + size_t end = + old_info_log_file_count - immutable_db_options_.keep_log_file_num; + for (unsigned int i = 0; i <= end; i++) { + std::string& to_delete = old_info_log_files.at(i); + std::string full_path_to_delete = + (immutable_db_options_.db_log_dir.empty() + ? 
dbname_ + : immutable_db_options_.db_log_dir) + + "/" + to_delete; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[JOB %d] Delete info log file %s\n", state.job_id, + full_path_to_delete.c_str()); + Status s = env_->DeleteFile(full_path_to_delete); + if (!s.ok()) { + if (env_->FileExists(full_path_to_delete).IsNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[JOB %d] Tried to delete non-existing info log file %s FAILED " + "-- %s\n", + state.job_id, to_delete.c_str(), s.ToString().c_str()); + } else { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "[JOB %d] Delete info log file %s FAILED -- %s\n", + state.job_id, to_delete.c_str(), + s.ToString().c_str()); + } + } + } + } +#ifndef ROCKSDB_LITE + wal_manager_.PurgeObsoleteWALFiles(); +#endif // ROCKSDB_LITE + LogFlush(immutable_db_options_.info_log); + InstrumentedMutexLock l(&mutex_); + --pending_purge_obsolete_files_; + assert(pending_purge_obsolete_files_ >= 0); + if (pending_purge_obsolete_files_ == 0) { + bg_cv_.SignalAll(); + } + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End"); +} + +void DBImpl::DeleteObsoleteFiles() { + mutex_.AssertHeld(); + JobContext job_context(next_job_id_.fetch_add(1)); + FindObsoleteFiles(&job_context, true); + + mutex_.Unlock(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); +} + +uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const ColumnFamilyData* cfd_to_flush, + const autovector& memtables_to_flush) { + uint64_t min_log = 0; + + // we must look through the memtables for two phase transactions + // that have been committed but not yet flushed + for (auto loop_cfd : *vset->GetColumnFamilySet()) { + if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) { + continue; + } + + auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( + memtables_to_flush); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +uint64_t PrecomputeMinLogNumberToKeep( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + autovector edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + // Calculate updated min_log_number_to_keep + // Since the function should only be called in 2pc mode, log number in + // the version edit should be sufficient. + + // Precompute the min log number containing unflushed data for the column + // family being flushed (`cfd_to_flush`). + uint64_t cf_min_log_number_to_keep = 0; + for (auto& e : edit_list) { + if (e->HasLogNumber()) { + cf_min_log_number_to_keep = + std::max(cf_min_log_number_to_keep, e->GetLogNumber()); + } + } + if (cf_min_log_number_to_keep == 0) { + // No version edit contains information on log number. The log number + // for this column family should stay the same as it is. + cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber(); + } + + // Get min log number containing unflushed data for other column families. + uint64_t min_log_number_to_keep = + vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush); + if (cf_min_log_number_to_keep != 0) { + min_log_number_to_keep = + std::min(cf_min_log_number_to_keep, min_log_number_to_keep); + } + + // if are 2pc we must consider logs containing prepared + // sections of outstanding transactions. 
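In FindMinPrepLogReferencedByMemTable() above, and in the prep-tracker handling that follows, a log number of 0 means "no constraint", and the combined bound is the smallest nonzero candidate. That combining rule, isolated as a hypothetical helper:

#include <cstdint>
#include <initializer_list>

// Returns the smallest nonzero candidate, or 0 if every source reported
// "no constraint". Mirrors the (log > 0 && (min_log == 0 || log < min_log))
// pattern used above.
uint64_t MinNonZeroLog(std::initializer_list<uint64_t> candidates) {
  uint64_t min_log = 0;
  for (uint64_t log : candidates) {
    if (log > 0 && (min_log == 0 || log < min_log)) {
      min_log = log;
    }
  }
  return min_log;
}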
+ // + // We must check min logs with outstanding prep before we check + // logs references by memtables because a log referenced by the + // first data structure could transition to the second under us. + // + // TODO: iterating over all column families under db mutex. + // should find more optimal solution + auto min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( + vset, &cfd_to_flush, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + return min_log_number_to_keep; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc new file mode 100644 index 000000000..6ae4ead54 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_open.cc @@ -0,0 +1,1651 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/db_impl/db_impl.h" + +#include + +#include "db/builder.h" +#include "db/error_handler.h" +#include "env/composite_env_wrapper.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "rocksdb/wal_filter.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/sync_point.h" +#include "util/rate_limiter.h" + +namespace ROCKSDB_NAMESPACE { +Options SanitizeOptions(const std::string& dbname, const Options& src) { + auto db_options = SanitizeOptions(dbname, DBOptions(src)); + ImmutableDBOptions immutable_db_options(db_options); + auto cf_options = + SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); + return Options(db_options, cf_options); +} + +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { + DBOptions result(src); + + if (result.file_system == nullptr) { + if (result.env == Env::Default()) { + result.file_system = FileSystem::Default(); + } else { + result.file_system.reset(new LegacyFileSystemWrapper(result.env)); + } + } else { + if (result.env == nullptr) { + result.env = Env::Default(); + } + } + + // result.max_open_files means an "infinite" open files. 
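The branch below leaves a value of -1 ("keep every file open") untouched and otherwise clips the request into a sane range, falling back to a large cap when the platform cannot report its limit. A standalone sketch of that clamp (ClampMaxOpenFiles is a hypothetical name, not the ClipToRange helper the real code uses):

#include <algorithm>

int ClampMaxOpenFiles(int requested, int system_limit) {
  if (requested == -1) {
    return -1;               // "unlimited": table cache keeps all files open
  }
  if (system_limit <= 0) {
    system_limit = 0x400000; // limit unknown: fall back to a generous cap
  }
  // Keep at least 20 handles, never more than the platform allows.
  return std::max(20, std::min(requested, system_limit));
}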
+ if (result.max_open_files != -1) { + int max_max_open_files = port::GetMaxOpenFiles(); + if (max_max_open_files == -1) { + max_max_open_files = 0x400000; + } + ClipToRange(&result.max_open_files, 20, max_max_open_files); + TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles", + &result.max_open_files); + } + + if (result.info_log == nullptr) { + Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); + if (!s.ok()) { + // No place suitable for logging + result.info_log = nullptr; + } + } + + if (!result.write_buffer_manager) { + result.write_buffer_manager.reset( + new WriteBufferManager(result.db_write_buffer_size)); + } + auto bg_job_limits = DBImpl::GetBGJobLimits( + result.max_background_flushes, result.max_background_compactions, + result.max_background_jobs, true /* parallelize_compactions */); + result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, + Env::Priority::LOW); + result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, + Env::Priority::HIGH); + + if (result.rate_limiter.get() != nullptr) { + if (result.bytes_per_sync == 0) { + result.bytes_per_sync = 1024 * 1024; + } + } + + if (result.delayed_write_rate == 0) { + if (result.rate_limiter.get() != nullptr) { + result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond(); + } + if (result.delayed_write_rate == 0) { + result.delayed_write_rate = 16 * 1024 * 1024; + } + } + + if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { + result.recycle_log_file_num = false; + } + + if (result.recycle_log_file_num && + (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { + // kPointInTimeRecovery is inconsistent with recycle log file feature since + // we define the "end" of the log as the first corrupt record we encounter. + // kAbsoluteConsistency doesn't make sense because even a clean + // shutdown leaves old junk at the end of the log file. + result.recycle_log_file_num = 0; + } + + if (result.wal_dir.empty()) { + // Use dbname as default + result.wal_dir = dbname; + } + if (result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); + } + + if (result.db_paths.size() == 0) { + result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + } + + if (result.use_direct_reads && result.compaction_readahead_size == 0) { + TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr); + result.compaction_readahead_size = 1024 * 1024 * 2; + } + + if (result.compaction_readahead_size > 0 || result.use_direct_reads) { + result.new_table_reader_for_compaction_inputs = true; + } + + // Force flush on DB open if 2PC is enabled, since with 2PC we have no + // guarantee that consecutive log files have consecutive sequence id, which + // make recovery complicated. + if (result.allow_2pc) { + result.avoid_flush_during_recovery = false; + } + +#ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. 
In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", filename.length() - + std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } + // When the DB is stopped, it's possible that there are some .trash files that + // were not deleted yet, when we open the DB we will find these .trash files + // and schedule them to be deleted (or delete immediately if SstFileManager + // was not used) + auto sfm = static_cast(result.sst_file_manager.get()); + for (size_t i = 0; i < result.db_paths.size(); i++) { + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + } + + // Create a default SstFileManager for purposes of tracking compaction size + // and facilitating recovery from out of space errors. + if (result.sst_file_manager.get() == nullptr) { + std::shared_ptr sst_file_manager( + NewSstFileManager(result.env, result.info_log)); + result.sst_file_manager = sst_file_manager; + } +#endif + + if (!result.paranoid_checks) { + result.skip_checking_sst_file_sizes_on_db_open = true; + ROCKS_LOG_INFO(result.info_log, + "file size check will be skipped during open."); + } + + return result; +} + +namespace { +Status SanitizeOptionsByTable( + const DBOptions& db_opts, + const std::vector& column_families) { + Status s; + for (auto cf : column_families) { + s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} +} // namespace + +Status DBImpl::ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families) { + Status s; + for (auto& cfd : column_families) { + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); + if (!s.ok()) { + return s; + } + } + s = ValidateOptions(db_options); + return s; +} + +Status DBImpl::ValidateOptions(const DBOptions& db_options) { + if (db_options.db_paths.size() > 4) { + return Status::NotSupported( + "More than four DB paths are not supported yet. "); + } + + if (db_options.allow_mmap_reads && db_options.use_direct_reads) { + // Protect against assert in PosixMMapReadableFile constructor + return Status::NotSupported( + "If memory mapped reads (allow_mmap_reads) are enabled " + "then direct I/O reads (use_direct_reads) must be disabled. "); + } + + if (db_options.allow_mmap_writes && + db_options.use_direct_io_for_flush_and_compaction) { + return Status::NotSupported( + "If memory mapped writes (allow_mmap_writes) are enabled " + "then direct I/O writes (use_direct_io_for_flush_and_compaction) must " + "be disabled. 
"); + } + + if (db_options.keep_log_file_num == 0) { + return Status::InvalidArgument("keep_log_file_num must be greater than 0"); + } + + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + + if (db_options.atomic_flush && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "atomic_flush is incompatible with enable_pipelined_write"); + } + + return Status::OK(); +} + +Status DBImpl::NewDB() { + VersionEdit new_db; + Status s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.write_dbid_to_manifest) { + std::string temp_db_id; + GetDbIdentityFromIdentityFile(&temp_db_id); + new_db.SetDBId(temp_db_id); + } + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); + const std::string manifest = DescriptorFileName(dbname_, 1); + { + std::unique_ptr file; + FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); + s = NewWritableFile(fs_.get(), manifest, &file, file_options); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize( + immutable_db_options_.manifest_preallocation_size); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), manifest, file_options, env_, nullptr /* stats */, + immutable_db_options_.listeners)); + log::Writer log(std::move(file_writer), 0, false); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + if (s.ok()) { + s = SyncManifest(env_, &immutable_db_options_, log.file()); + } + } + if (s.ok()) { + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); + } else { + fs_->DeleteFile(manifest, IOOptions(), nullptr); + } + return s; +} + +Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. + // when dbname_ is "dir/db" but when "dir" doesn't exist. 
+ Status s = env->CreateDirIfMissing(dirname); + if (!s.ok()) { + return s; + } + return env->NewDirectory(dirname, directory); +} + +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { + Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); + if (!s.ok()) { + return s; + } + if (!wal_dir.empty() && dbname != wal_dir) { + s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_); + if (!s.ok()) { + return s; + } + } + + data_dirs_.clear(); + for (auto& p : data_paths) { + const std::string db_path = p.path; + if (db_path == dbname) { + data_dirs_.emplace_back(nullptr); + } else { + std::unique_ptr path_directory; + s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory); + if (!s.ok()) { + return s; + } + data_dirs_.emplace_back(path_directory.release()); + } + } + assert(data_dirs_.size() == data_paths.size()); + return Status::OK(); +} + +Status DBImpl::Recover( + const std::vector& column_families, bool read_only, + bool error_if_log_file_exist, bool error_if_data_exists_in_logs, + uint64_t* recovered_seq) { + mutex_.AssertHeld(); + + bool is_new_db = false; + assert(db_lock_ == nullptr); + if (!read_only) { + Status s = directories_.SetDirectories(env_, dbname_, + immutable_db_options_.wal_dir, + immutable_db_options_.db_paths); + if (!s.ok()) { + return s; + } + + s = env_->LockFile(LockFileName(dbname_), &db_lock_); + if (!s.ok()) { + return s; + } + + std::string current_fname = CurrentFileName(dbname_); + s = env_->FileExists(current_fname); + if (s.IsNotFound()) { + if (immutable_db_options_.create_if_missing) { + s = NewDB(); + is_new_db = true; + if (!s.ok()) { + return s; + } + } else { + return Status::InvalidArgument( + current_fname, "does not exist (create_if_missing is false)"); + } + } else if (s.ok()) { + if (immutable_db_options_.error_if_exists) { + return Status::InvalidArgument(dbname_, + "exists (error_if_exists is true)"); + } + } else { + // Unexpected error reading file + assert(s.IsIOError()); + return s; + } + // Verify compatibility of file_options_ and filesystem + { + std::unique_ptr idfile; + FileOptions customized_fs(file_options_); + customized_fs.use_direct_reads |= + immutable_db_options_.use_direct_io_for_flush_and_compaction; + s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, + nullptr); + if (!s.ok()) { + std::string error_str = s.ToString(); + // Check if unsupported Direct I/O is the root cause + customized_fs.use_direct_reads = false; + s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, + nullptr); + if (s.ok()) { + return Status::InvalidArgument( + "Direct I/O is not supported by the specified DB."); + } else { + return Status::InvalidArgument( + "Found options incompatible with filesystem", error_str.c_str()); + } + } + } + } + assert(db_id_.empty()); + Status s = versions_->Recover(column_families, read_only, &db_id_); + if (!s.ok()) { + return s; + } + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. 
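A few steps earlier in Recover(), the presence of the CURRENT file together with create_if_missing / error_if_exists decides between creating a brand-new DB, opening the existing one, or failing. That decision, reduced to a sketch with a hypothetical result enum:

enum class OpenAction { kCreateNew, kOpenExisting, kFailMissing, kFailExists };

OpenAction DecideOpen(bool current_file_exists, bool create_if_missing,
                      bool error_if_exists) {
  if (!current_file_exists) {
    // No CURRENT file: the directory holds no DB as far as RocksDB is
    // concerned, so NewDB() runs only if the caller allowed creation.
    return create_if_missing ? OpenAction::kCreateNew
                             : OpenAction::kFailMissing;
  }
  // CURRENT exists: refuse to open only if the caller demanded a fresh DB.
  return error_if_exists ? OpenAction::kFailExists : OpenAction::kOpenExisting;
}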
+ if (s.IsNotFound()) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + false); + } + } else { + s = SetIdentityFile(env_, dbname_, db_id_); + } + + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + if (s.ok() && !read_only) { + std::map> created_dirs; + for (auto cfd : *versions_->GetColumnFamilySet()) { + s = cfd->AddDirectories(&created_dirs); + if (!s.ok()) { + return s; + } + } + } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } + + if (s.ok()) { + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + + SequenceNumber next_sequence(kMaxSequenceNumber); + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled + single_column_family_mode_ = + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; + + // Recover from all newer log files than the ones named in the + // descriptor (new log files may have been added by the previous + // incarnation without registering them in the descriptor). + // + // Note that prev_log_number() is no longer used, but we pay + // attention to it in case we are recovering a database + // produced by an older version of rocksdb. 
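The block that follows turns a directory listing of the WAL dir into a sorted list of log numbers; the real code relies on ParseFileName(), while the sketch below approximates it for plain "<number>.log" names only (hypothetical helper):

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

std::vector<uint64_t> CollectWalNumbers(const std::vector<std::string>& names) {
  std::vector<uint64_t> logs;
  const std::string suffix = ".log";
  for (const std::string& name : names) {
    if (name.size() <= suffix.size() ||
        name.compare(name.size() - suffix.size(), suffix.size(), suffix) != 0) {
      continue;  // not a WAL file name
    }
    const std::string stem = name.substr(0, name.size() - suffix.size());
    if (stem.empty() ||
        stem.find_first_not_of("0123456789") != std::string::npos) {
      continue;  // prefix is not a bare file number
    }
    logs.push_back(std::stoull(stem));
  }
  std::sort(logs.begin(), logs.end());  // replay in generation order
  return logs;
}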
+ std::vector filenames; + s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + if (s.IsNotFound()) { + return Status::InvalidArgument("wal_dir not found", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { + return s; + } + + std::vector logs; + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) { + if (is_new_db) { + return Status::Corruption( + "While creating a new Db, wal_dir contains " + "existing log file: ", + filenames[i]); + } else { + logs.push_back(number); + } + } + } + + if (logs.size() > 0) { + if (error_if_log_file_exist) { + return Status::Corruption( + "The db was opened in readonly mode with error_if_log_file_exist" + "flag but a log file already exists"); + } else if (error_if_data_exists_in_logs) { + for (auto& log : logs) { + std::string fname = LogFileName(immutable_db_options_.wal_dir, log); + uint64_t bytes; + s = env_->GetFileSize(fname, &bytes); + if (s.ok()) { + if (bytes > 0) { + return Status::Corruption( + "error_if_data_exists_in_logs is set but there are data " + " in log files."); + } + } + } + } + } + + if (!logs.empty()) { + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + bool corrupted_log_found = false; + s = RecoverLogFiles(logs, &next_sequence, read_only, + &corrupted_log_found); + if (corrupted_log_found && recovered_seq != nullptr) { + *recovered_seq = next_sequence; + } + if (!s.ok()) { + // Clear memtables if recovery failed + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + } + } + } + } + + if (read_only) { + // If we are opening as read-only, we need to update options_file_number_ + // to reflect the most recent OPTIONS file. It does not matter for regular + // read-write db instance because options_file_number_ will later be + // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. + std::vector file_names; + if (s.ok()) { + s = env_->GetChildren(GetName(), &file_names); + } + if (s.ok()) { + uint64_t number = 0; + uint64_t options_file_number = 0; + FileType type; + for (const auto& fname : file_names) { + if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { + options_file_number = std::max(number, options_file_number); + } + } + versions_->options_file_number_ = options_file_number; + } + } + + return s; +} + +Status DBImpl::PersistentStatsProcessFormatVersion() { + mutex_.AssertHeld(); + Status s; + // persist version when stats CF doesn't exist + bool should_persist_format_version = !persistent_stats_cfd_exists_; + mutex_.Unlock(); + if (persistent_stats_cfd_exists_) { + // Check persistent stats format version compatibility. Drop and recreate + // persistent stats CF if format version is incompatible + uint64_t format_version_recovered = 0; + Status s_format = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); + uint64_t compatible_version_recovered = 0; + Status s_compatible = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kCompatibleVersion, + &compatible_version_recovered); + // abort reading from existing stats CF if any of following is true: + // 1. failed to read format version or compatible version from disk + // 2. 
sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. + persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + +// REQUIRES: log_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only, + bool* corrupted_log_found) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + Status* status; // nullptr if immutable_db_options_.paranoid_checks==false + void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? 
"(ignoring error) " : ""), + fname, static_cast(bytes), s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) { + *this->status = s; + } + } + }; + + mutex_.AssertHeld(); + Status status; + std::unordered_map version_edits; + // no need to refcount because iteration is under mutex + for (auto cfd : *versions_->GetColumnFamilySet()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + version_edits.insert({cfd->GetID(), edit}); + } + int job_id = next_job_id_.fetch_add(1); + { + auto stream = event_logger_.Log(); + stream << "job" << job_id << "event" + << "recovery_started"; + stream << "log_files"; + stream.StartArray(); + for (auto log_number : log_numbers) { + stream << log_number; + } + stream.EndArray(); + } + +#ifndef ROCKSDB_LITE + if (immutable_db_options_.wal_filter != nullptr) { + std::map cf_name_id_map; + std::map cf_lognumber_map; + for (auto cfd : *versions_->GetColumnFamilySet()) { + cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); + cf_lognumber_map.insert( + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + } + + immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map, + cf_name_id_map); + } +#endif + + bool stop_replay_by_wal_filter = false; + bool stop_replay_for_corruption = false; + bool flushed = false; + uint64_t corrupted_log_number = kMaxSequenceNumber; + uint64_t min_log_number = MinLogNumberToKeep(); + for (auto log_number : log_numbers) { + if (log_number < min_log_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Skipping log #%" PRIu64 + " since it is older than min log to keep #%" PRIu64, + log_number, min_log_number); + continue; + } + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(log_number); + // Open the log file + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", log_number, + static_cast(immutable_db_options_.wal_recovery_mode)); + auto logFileDropped = [this, &fname]() { + uint64_t bytes; + if (env_->GetFileSize(fname, &bytes).ok()) { + auto info_log = immutable_db_options_.info_log.get(); + ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(), + static_cast(bytes)); + } + }; + if (stop_replay_by_wal_filter) { + logFileDropped(); + continue; + } + + std::unique_ptr file_reader; + { + std::unique_ptr file; + status = fs_->NewSequentialFile(fname, + fs_->OptimizeForLogRead(file_options_), + &file, nullptr); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } + } + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); + } + + // Create the log reader. 
+ LogReporter reporter; + reporter.env = env_; + reporter.info_log = immutable_db_options_.info_log.get(); + reporter.fname = fname.c_str(); + if (!immutable_db_options_.paranoid_checks || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + reporter.status = nullptr; + } else { + reporter.status = &status; + } + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), + &reporter, true /*checksum*/, log_number); + + // Determine if we should tolerate incomplete records at the tail end of the + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + + while (!stop_replay_by_wal_filter && + reader.ReadRecord(&record, &scratch, + immutable_db_options_.wal_recovery_mode) && + status.ok()) { + if (record.size() < WriteBatchInternal::kHeader) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); + + if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + // In point-in-time recovery mode, if sequence id of log files are + // consecutive, we continue recovery despite corruption. This could + // happen when we open and write to a corrupted DB, where sequence id + // will start from the last sequence id we recovered. + if (sequence == *next_sequence) { + stop_replay_for_corruption = false; + } + if (stop_replay_for_corruption) { + logFileDropped(); + break; + } + } + +#ifndef ROCKSDB_LITE + if (immutable_db_options_.wal_filter != nullptr) { + WriteBatch new_batch; + bool batch_changed = false; + + WalFilter::WalProcessingOption wal_processing_option = + immutable_db_options_.wal_filter->LogRecordFound( + log_number, fname, batch, &new_batch, &batch_changed); + + switch (wal_processing_option) { + case WalFilter::WalProcessingOption::kContinueProcessing: + // do nothing, proceeed normally + break; + case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: + // skip current record + continue; + case WalFilter::WalProcessingOption::kStopReplay: + // skip current record and stop replay + stop_replay_by_wal_filter = true; + continue; + case WalFilter::WalProcessingOption::kCorruptedRecord: { + status = + Status::Corruption("Corruption reported by Wal Filter ", + immutable_db_options_.wal_filter->Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + reporter.Corruption(record.size(), status); + continue; + } + break; + } + default: { + assert(false); // unhandled case + status = Status::NotSupported( + "Unknown WalProcessingOption returned" + " by Wal Filter ", + immutable_db_options_.wal_filter->Name()); + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Ignore the error with current record processing. + continue; + } + } + } + + if (batch_changed) { + // Make sure that the count in the new batch is + // within the orignal count. 
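As the comment above says, a WAL filter is allowed to shrink a batch but never to grow it, and the rewritten batch must keep the original starting sequence number so later sequence accounting stays consistent. A sketch of that guard with a hypothetical batch type:

#include <cstdint>
#include <string>
#include <vector>

struct SketchBatch {
  uint64_t start_sequence = 0;
  std::vector<std::string> records;
};

// Returns false when the filtered batch must be rejected because it reports
// more records than the original; otherwise fixes up its sequence number.
bool AcceptFilteredBatch(const SketchBatch& original, SketchBatch* filtered) {
  if (filtered->records.size() > original.records.size()) {
    return false;  // the real code aborts recovery with NotSupported here
  }
  filtered->start_sequence = original.start_sequence;
  return true;
}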
+ int new_count = WriteBatchInternal::Count(&new_batch); + int original_count = WriteBatchInternal::Count(&batch); + if (new_count > original_count) { + ROCKS_LOG_FATAL( + immutable_db_options_.info_log, + "Recovering log #%" PRIu64 + " mode %d log filter %s returned " + "more records (%d) than original (%d) which is not allowed. " + "Aborting recovery.", + log_number, + static_cast(immutable_db_options_.wal_recovery_mode), + immutable_db_options_.wal_filter->Name(), new_count, + original_count); + status = Status::NotSupported( + "More than original # of records " + "returned by Wal Filter ", + immutable_db_options_.wal_filter->Name()); + return status; + } + // Set the same sequence number in the new_batch + // as the original batch. + WriteBatchInternal::SetSequence(&new_batch, + WriteBatchInternal::Sequence(&batch)); + batch = new_batch; + } + } +#endif // ROCKSDB_LITE + + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), &flush_scheduler_, + &trim_history_scheduler_, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); + MaybeIgnoreError(&status); + if (!status.ok()) { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reporter.Corruption(record.size(), status); + continue; + } + + if (has_valid_writes && !read_only) { + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfd->UnrefAndTryDelete(); + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= log_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + flushed = true; + + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + *next_sequence); + } + } + } + + if (!status.ok()) { + if (status.IsNotSupported()) { + // We should not treat NotSupported as corruption. It is rather a clear + // sign that we are processing a WAL that is produced by an incompatible + // version of the code. 
+ return status; + } + if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kSkipAnyCorruptedRecords) { + // We should ignore all errors unconditionally + status = Status::OK(); + } else if (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery) { + // We should ignore the error but not continue replaying + status = Status::OK(); + stop_replay_for_corruption = true; + corrupted_log_number = log_number; + if (corrupted_log_found != nullptr) { + *corrupted_log_found = true; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Point in time recovered to log #%" PRIu64 + " seq #%" PRIu64, + log_number, *next_sequence); + } else { + assert(immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kAbsoluteConsistency); + return status; + } + } + + flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } + // Compare the corrupted log number to all columnfamily's current log number. + // Abort Open() if any column family's log number is greater than + // the corrupted log number, which means CF contains data beyond the point of + // corruption. This could during PIT recovery when the WAL is corrupted and + // some (but not all) CFs are flushed + // Exclude the PIT case where no log is dropped after the corruption point. + // This is to cover the case for empty logs after corrupted log, in which we + // don't reset stop_replay_for_corruption. + if (stop_replay_for_corruption == true && + (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->GetLogNumber() > corrupted_log_number) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Column family inconsistency: SST file contains data" + " beyond the point of corruption."); + return Status::Corruption("SST file is ahead of WALs"); + } + } + } + + // True if there's any data in the WALs; if not, we can skip re-processing + // them later + bool data_seen = false; + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + auto max_log_number = log_numbers.back(); + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + + if (cfd->GetLogNumber() > max_log_number) { + // Column family cfd has already flushed the data + // from all logs. Memtable has to be empty because + // we filter the updates based on log_number + // (in WriteBatch::InsertInto) + assert(cfd->mem()->GetFirstSequenceNumber() == 0); + assert(edit->NumEntries() == 0); + continue; + } + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); + + // flush the final memtable (if non-empty) + if (cfd->mem()->GetFirstSequenceNumber() != 0) { + // If flush happened in the middle of recovery (e.g. due to memtable + // being full), we flush at the end. 
Otherwise we'll need to record + // where we were on last flush, which make the logic complicated. + if (flushed || !immutable_db_options_.avoid_flush_during_recovery) { + status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); + if (!status.ok()) { + // Recovery failed + break; + } + flushed = true; + + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), + versions_->LastSequence()); + } + data_seen = true; + } + + // Update the log number info in the version edit corresponding to this + // column family. Note that the version edits will be written to MANIFEST + // together later. + // writing log_number in the manifest means that any log file + // with number strongly less than (log_number + 1) is already + // recovered and should be ignored on next reincarnation. + // Since we already recovered max_log_number, we want all logs + // with numbers `<= max_log_number` (includes this one) to be ignored + if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { + edit->SetLogNumber(max_log_number + 1); + } + } + if (status.ok()) { + // we must mark the next log number as used, even though it's + // not actually used. that is because VersionSet assumes + // VersionSet::next_file_number_ always to be strictly greater than any + // log number + versions_->MarkFileNumberUsed(max_log_number + 1); + + autovector cfds; + autovector cf_opts; + autovector> edit_lists; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfds.push_back(cfd); + cf_opts.push_back(cfd->GetLatestMutableCFOptions()); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + edit_lists.push_back({&iter->second}); + } + // write MANIFEST with update + status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, + directories_.GetDbDir(), + /*new_descriptor_log=*/true); + } + } + + if (status.ok() && data_seen && !flushed) { + status = RestoreAliveLogFiles(log_numbers); + } + + event_logger_.Log() << "job" << job_id << "event" + << "recovery_finished"; + + return status; +} + +Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { + if (log_numbers.empty()) { + return Status::OK(); + } + Status s; + mutex_.AssertHeld(); + assert(immutable_db_options_.avoid_flush_during_recovery); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + // Mark these as alive so they'll be considered for deletion later by + // FindObsoleteFiles() + total_log_size_ = 0; + log_empty_ = false; + for (auto log_number : log_numbers) { + LogFileNumberSize log(log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + // This gets the appear size of the logs, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (!s.ok()) { + break; + } + total_log_size_ += log.size; + alive_log_files_.push_back(log); + // We preallocate space for logs, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. 
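Only the newest WAL is expected to carry unused preallocated space, so only that one is truncated back to the size that was actually recovered, and a failure to truncate is treated as harmless. A POSIX sketch of the same idea (truncate() stands in for the FileSystem call; hypothetical helper):

#include <string>
#include <sys/types.h>
#include <unistd.h>

bool TrimLastWal(const std::string& path, off_t recovered_size, bool is_last) {
  if (!is_last) {
    return true;  // older WALs keep whatever size they have
  }
  // Non-fatal on failure, mirroring the warning-only handling below.
  return ::truncate(path.c_str(), recovered_size) == 0;
}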
+ if (log_number == log_numbers.back()) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. + if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", log_number, + truncate_status.ToString().c_str()); + } + } + } + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + return s; +} + +Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, + MemTable* mem, VersionEdit* edit) { + mutex_.AssertHeld(); + const uint64_t start_micros = env_->NowMicros(); + FileMetaData meta; + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + Status s; + TableProperties table_properties; + { + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + + // Get the latest mutable cf options while the mutex is still locked + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + bool paranoid_file_checks = + cfd->GetLatestMutableCFOptions()->paranoid_file_checks; + + int64_t _current_time = 0; + env_->GetCurrentTime(&_current_time); // ignore error + const uint64_t current_time = static_cast(_current_time); + meta.oldest_ancester_time = current_time; + + { + auto write_hint = cfd->CalculateSSTWriteHint(0); + mutex_.Unlock(); + + SequenceNumber earliest_write_conflict_snapshot; + std::vector snapshot_seqs = + snapshots_.GetAll(&earliest_write_conflict_snapshot); + auto snapshot_checker = snapshot_checker_.get(); + if (use_custom_gc_ && snapshot_checker == nullptr) { + snapshot_checker = DisableGCSnapshotChecker::Instance(); + } + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + s = BuildTable( + dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.sample_for_compression, + cfd->ioptions()->compression_opts, paranoid_file_checks, + cfd->internal_stats(), TableFileCreationReason::kRecovery, + &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, + -1 /* level */, current_time, write_hint); + LogFlush(immutable_db_options_.info_log); + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), + meta.fd.GetFileSize(), s.ToString().c_str()); + mutex_.Lock(); 
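The unlock/relock sequence above is the standard pattern in this file: the DB mutex is released around the expensive BuildTable() call so other threads can make progress, then re-acquired before shared state is touched again. A sketch of that pattern over std::mutex (RocksDB uses its own InstrumentedMutex, and this code path does not rely on exceptions):

#include <mutex>

// 'build' is assumed to return a status-like value and not to throw; the
// caller must already hold 'db_mutex' on entry, and holds it again on return.
template <typename BuildFn>
auto BuildTableUnlocked(std::mutex& db_mutex, BuildFn build) {
  db_mutex.unlock();  // long-running work proceeds without the DB mutex
  auto result = build();
  db_mutex.lock();    // restore the locking invariant for the caller
  return result;
}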
+ } + } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.fd.GetFileSize() > 0) { + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), + meta.fd.GetFileSize(), meta.smallest, meta.largest, + meta.fd.smallest_seqno, meta.fd.largest_seqno, + meta.marked_for_compaction, meta.oldest_blob_file_number, + meta.oldest_ancester_time, meta.file_creation_time, + meta.file_checksum, meta.file_checksum_func_name); + } + + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); + stats.micros = env_->NowMicros() - start_micros; + stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); + cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); + RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + return s; +} + +Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } + std::vector handles; + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } + // i can delete the handle since DBImpl is always holding a reference to + // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } + delete handles[0]; + } + return s; +} + +Status DB::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + const bool kSeqPerBatch = true; + const bool kBatchPerTxn = true; + return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); +} + +Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log) { + Status s; + std::unique_ptr lfile; + + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + FileOptions opt_file_options = + fs_->OptimizeForLogWrite(file_options_, db_options); + std::string log_fname = + LogFileName(immutable_db_options_.wal_dir, log_file_num); + + if (recycle_log_number) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "reusing log %" PRIu64 " from recycle list\n", + recycle_log_number); + std::string old_log_fname = + LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); + TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); + s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, + &lfile, /*dbg=*/nullptr); + } else { + s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); + } + + if (s.ok()) { + lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); + lfile->SetPreallocationBlockSize(preallocate_block_size); + + const auto& listeners = immutable_db_options_.listeners; + std::unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), log_fname, 
opt_file_options, + env_, nullptr /* stats */, listeners)); + *new_log = new log::Writer(std::move(file_writer), log_file_num, + immutable_db_options_.recycle_log_file_num > 0, + immutable_db_options_.manual_wal_flush); + } + return s; +} + +Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn) { + Status s = SanitizeOptionsByTable(db_options, column_families); + if (!s.ok()) { + return s; + } + + s = ValidateOptions(db_options, column_families); + if (!s.ok()) { + return s; + } + + *dbptr = nullptr; + handles->clear(); + + size_t max_write_buffer_size = 0; + for (auto cf : column_families) { + max_write_buffer_size = + std::max(max_write_buffer_size, cf.options.write_buffer_size); + } + + DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); + if (s.ok()) { + std::vector paths; + for (auto& db_path : impl->immutable_db_options_.db_paths) { + paths.emplace_back(db_path.path); + } + for (auto& cf : column_families) { + for (auto& cf_path : cf.options.cf_paths) { + paths.emplace_back(cf_path.path); + } + } + for (auto& path : paths) { + s = impl->env_->CreateDirIfMissing(path); + if (!s.ok()) { + break; + } + } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } + } + + if (!s.ok()) { + delete impl; + return s; + } + + s = impl->CreateArchivalDirectory(); + if (!s.ok()) { + delete impl; + return s; + } + + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + + impl->mutex_.Lock(); + // Handles create_if_missing, error_if_exists + uint64_t recovered_seq(kMaxSequenceNumber); + s = impl->Recover(column_families, false, false, false, &recovered_seq); + if (s.ok()) { + uint64_t new_log_number = impl->versions_->NewFileNumber(); + log::Writer* new_log = nullptr; + const size_t preallocate_block_size = + impl->GetWalPreallocateBlockSize(max_write_buffer_size); + s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/, + preallocate_block_size, &new_log); + if (s.ok()) { + InstrumentedMutexLock wl(&impl->log_write_mutex_); + impl->logfile_number_ = new_log_number; + assert(new_log != nullptr); + impl->logs_.emplace_back(new_log_number, new_log); + } + + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd != nullptr) { + handles->push_back( + new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); + } else { + if (db_options.create_missing_column_families) { + // missing column family, create it + ColumnFamilyHandle* handle; + impl->mutex_.Unlock(); + s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + impl->mutex_.Lock(); + if (s.ok()) { + handles->push_back(handle); + } else { + break; + } + } else { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + } + } + } + if (s.ok()) { + SuperVersionContext sv_context(/* create_superversion */ true); + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + impl->InstallSuperVersionAndScheduleWork( + cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); + } + sv_context.Clean(); + if (impl->two_write_queues_) { + impl->log_write_mutex_.Lock(); + 
} + impl->alive_log_files_.push_back( + DBImpl::LogFileNumberSize(impl->logfile_number_)); + if (impl->two_write_queues_) { + impl->log_write_mutex_.Unlock(); + } + + impl->DeleteObsoleteFiles(); + s = impl->directories_.GetDbDir()->Fsync(); + } + if (s.ok()) { + // In WritePrepared there could be gap in sequence numbers. This breaks + // the trick we use in kPointInTimeRecovery which assumes the first seq in + // the log right after the corrupted log is one larger than the last seq + // we read from the logs. To let this trick keep working, we add a dummy + // entry with the expected sequence to the first log right after recovery. + // In non-WritePrepared case also the new log after recovery could be + // empty, and thus missing the consecutive seq hint to distinguish + // middle-log corruption to corrupted-log-remained-after-recovery. This + // case also will be addressed by a dummy write. + if (recovered_seq != kMaxSequenceNumber) { + WriteBatch empty_batch; + WriteBatchInternal::SetSequence(&empty_batch, recovered_seq); + WriteOptions write_options; + uint64_t log_used, log_size; + log::Writer* log_writer = impl->logs_.back().writer; + s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size); + if (s.ok()) { + // Need to fsync, otherwise it might get lost after a power reset. + s = impl->FlushWAL(false); + if (s.ok()) { + s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); + } + } + } + } + } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } + + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + auto* vstorage = cfd->current()->storage_info(); + for (int i = 1; i < vstorage->num_levels(); ++i) { + int num_files = vstorage->NumLevelFiles(i); + if (num_files > 0) { + s = Status::InvalidArgument( + "Not all files are at level 0. Cannot " + "open with FIFO compaction style."); + break; + } + } + } + if (!cfd->mem()->IsSnapshotSupported()) { + impl->is_snapshot_supported_ = false; + } + if (cfd->ioptions()->merge_operator != nullptr && + !cfd->mem()->IsMergeOperatorSupported()) { + s = Status::InvalidArgument( + "The memtable of column family %s does not support merge operator " + "its options.merge_operator is non-null", + cfd->GetName().c_str()); + } + if (!s.ok()) { + break; + } + } + } + TEST_SYNC_POINT("DBImpl::Open:Opened"); + Status persist_options_status; + if (s.ok()) { + // Persist RocksDB Options before scheduling the compaction. + // The WriteOptionsFile() will release and lock the mutex internally. + persist_options_status = impl->WriteOptionsFile( + false /*need_mutex_lock*/, false /*need_enter_write_thread*/); + + *dbptr = impl; + impl->opened_successfully_ = true; + impl->MaybeScheduleFlushOrCompaction(); + } + impl->mutex_.Unlock(); + +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + impl->immutable_db_options_.sst_file_manager.get()); + if (s.ok() && sfm) { + // Notify SstFileManager about all sst files that already exist in + // db_paths[0] and cf_paths[0] when the DB is opened. + + // SstFileManagerImpl needs to know sizes of the files. For files whose size + // we already know (sst files that appear in manifest - typically that's the + // vast majority of all files), we'll pass the size to SstFileManager. + // For all other files SstFileManager will query the size from filesystem. 
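Before the SstFileManager bookkeeping that follows, the open path above has already created any requested-but-missing column families (when create_missing_column_families is set) and installed a super version for each. A caller-side sketch of that multi-column-family open; the "cf1" name is purely illustrative.

#include <cassert>
#include <string>
#include <vector>

#include "rocksdb/db.h"

void OpenWithColumnFamilies(const std::string& db_path) {
  rocksdb::DBOptions db_options;
  db_options.create_if_missing = true;
  // Without this flag, a requested-but-absent family fails the open with
  // Status::InvalidArgument("Column family not found: ...").
  db_options.create_missing_column_families = true;

  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
  column_families.emplace_back(rocksdb::kDefaultColumnFamilyName,
                               rocksdb::ColumnFamilyOptions());
  column_families.emplace_back("cf1", rocksdb::ColumnFamilyOptions());

  std::vector<rocksdb::ColumnFamilyHandle*> handles;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(db_options, db_path, column_families, &handles, &db);
  assert(s.ok());
  for (auto* h : handles) {
    delete h;  // the caller owns the returned handles
  }
  delete db;
}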
+ + std::vector metadata; + + impl->mutex_.Lock(); + impl->versions_->GetLiveFilesMetaData(&metadata); + impl->mutex_.Unlock(); + + std::unordered_map known_file_sizes; + for (const auto& md : metadata) { + std::string name = md.name; + if (!name.empty() && name[0] == '/') { + name = name.substr(1); + } + known_file_sizes[name] = md.size; + } + + std::vector paths; + paths.emplace_back(impl->immutable_db_options_.db_paths[0].path); + for (auto& cf : column_families) { + if (!cf.options.cf_paths.empty()) { + paths.emplace_back(cf.options.cf_paths[0].path); + } + } + // Remove duplicate paths. + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + for (auto& path : paths) { + std::vector existing_files; + impl->immutable_db_options_.env->GetChildren(path, &existing_files); + for (auto& file_name : existing_files) { + uint64_t file_number; + FileType file_type; + std::string file_path = path + "/" + file_name; + if (ParseFileName(file_name, &file_number, &file_type) && + file_type == kTableFile) { + if (known_file_sizes.count(file_name)) { + // We're assuming that each sst file name exists in at most one of + // the paths. + sfm->OnAddFile(file_path, known_file_sizes.at(file_name), + /* compaction */ false); + } else { + sfm->OnAddFile(file_path); + } + } + } + } + + // Reserve some disk buffer space. This is a heuristic - when we run out + // of disk space, this ensures that there is atleast write_buffer_size + // amount of free space before we resume DB writes. In low disk space + // conditions, we want to avoid a lot of small L0 files due to frequent + // WAL write failures and resultant forced flushes + sfm->ReserveDiskBuffer(max_write_buffer_size, + impl->immutable_db_options_.db_paths[0].path); + } +#endif // !ROCKSDB_LITE + + if (s.ok()) { + ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", + impl); + LogFlush(impl->immutable_db_options_.info_log); + assert(impl->TEST_WALBufferIsEmpty()); + // If the assert above fails then we need to FlushWAL before returning + // control back to the user. + if (!persist_options_status.ok()) { + s = Status::IOError( + "DB::Open() failed --- Unable to persist Options file", + persist_options_status.ToString()); + } + } + if (s.ok()) { + impl->StartTimedTasks(); + } + if (!s.ok()) { + for (auto* h : *handles) { + delete h; + } + handles->clear(); + delete impl; + *dbptr = nullptr; + } + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc new file mode 100644 index 000000000..a4242bfe1 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
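The file below implements the read-only open path, where every mutating call on the returned handle reports Status::NotSupported(). A minimal caller-side sketch of how it is typically used; the key is made up.

#include <cassert>
#include <string>

#include "rocksdb/db.h"

void ReadOnlyOpenSketch(const std::string& db_path) {
  rocksdb::Options options;
  rocksdb::DB* db = nullptr;
  // Opens an existing database without write access; compaction, flush and
  // all write operations stay disabled for the lifetime of the handle.
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, db_path, &db);
  assert(s.ok());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);  // key is made up
  assert(s.ok() || s.IsNotFound());
  delete db;
}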
+ +#include "db/db_impl/db_impl_readonly.h" +#include "db/arena_wrapped_db_iter.h" + +#include "db/compacted_db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/merge_context.h" +#include "monitoring/perf_context_imp.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE + +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in read only mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplReadOnly::~DBImplReadOnly() {} + +// Implementations of the DB interface +Status DBImplReadOnly::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? + PERF_TIMER_GUARD(get_snapshot_time); + Status s; + SequenceNumber snapshot = versions_->LastSequence(); + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + SuperVersion* super_version = cfd->GetSuperVersion(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordInHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + return s; +} + +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + ReadCallback* read_callback = nullptr; // No read callback provided. + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + read_seq, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = + NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplReadOnly::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? 
reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + + for (auto cfh : column_families) { + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback); + auto* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); + } + + return Status::OK(); +} + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool /*error_if_log_file_exist*/) { + *dbptr = nullptr; + + // Try to first open DB as fully compacted DB + Status s; + s = CompactedDBImpl::Open(options, dbname, dbptr); + if (s.ok()) { + return s; + } + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + + s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a + // reference to default column family + delete handles[0]; + } + return s; +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_log_file_exist) { + *dbptr = nullptr; + handles->clear(); + + SuperVersionContext sv_context(/* create_superversion */ true); + DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families, true /* read only */, + error_if_log_file_exist); + if (s.ok()) { + // set column family handles + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (cfd == nullptr) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} + +#else // !ROCKSDB_LITE + +Status DB::OpenForReadOnly(const Options& /*options*/, + const std::string& /*dbname*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenForReadOnly( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h new file mode 100644 index 000000000..04d06b4a1 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_readonly.h 
@@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class DBImplReadOnly : public DBImpl { + public: + DBImplReadOnly(const DBOptions& options, const std::string& dbname); + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&) = delete; + void operator=(const DBImplReadOnly&) = delete; + + virtual ~DBImplReadOnly(); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + + // TODO: Implement ReadOnly MultiGet? + + using DBImpl::NewIterator; + virtual Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + virtual Status NewIterators( + const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::SingleDelete; + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, 
manifest_file_size, + false /* flush_memtable */); + } + + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + private: + friend class DB; +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc new file mode 100644 index 000000000..f0ec27c32 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc @@ -0,0 +1,671 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl/db_impl_secondary.h" + +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/merge_context.h" +#include "logging/auto_roll_logger.h" +#include "monitoring/perf_context_imp.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_log_file_exist*/, + bool /*error_if_data_exists_in_logs*/, uint64_t*) { + mutex_.AssertHeld(); + + JobContext job_context(0); + Status s; + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. 
+  max_total_in_memory_state_ = 0;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
+  }
+  if (s.ok()) {
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+    single_column_family_mode_ =
+        versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
+
+    std::unordered_set<ColumnFamilyData*> cfds_changed;
+    s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+  }
+
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
+  // TODO: update options_file_number_ needed?
+
+  job_context.Clean();
+  return s;
+}
+
+// find new WAL and apply them in order to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  Status s;
+  std::vector<uint64_t> logs;
+  s = FindNewLogNumbers(&logs);
+  if (s.ok() && !logs.empty()) {
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+  }
+  return s;
+}
+
+// List wal_dir and find all new WALs, return these log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+  assert(logs != nullptr);
+  std::vector<std::string> filenames;
+  Status s;
+  s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames);
+  if (s.IsNotFound()) {
+    return Status::InvalidArgument("Failed to open wal_dir",
+                                   immutable_db_options_.wal_dir);
+  } else if (!s.ok()) {
+    return s;
+  }
+
+  // if log_readers_ is non-empty, it means we have applied all logs with log
+  // numbers smaller than the smallest log in log_readers_, so there is no
+  // need to pass these logs to RecoverLogFiles
+  uint64_t log_number_min = 0;
+  if (!log_readers_.empty()) {
+    log_number_min = log_readers_.begin()->first;
+  }
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(filenames[i], &number, &type) && type == kLogFile &&
+        number >= log_number_min) {
+      logs->push_back(number);
+    }
+  }
+  // Recover logs in the order that they were generated
+  if (!logs->empty()) {
+    std::sort(logs->begin(), logs->end());
+  }
+  return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+    uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+  auto iter = log_readers_.find(log_number);
+  // make sure the log file is still present
+  if (iter == log_readers_.end() ||
+      iter->second->reader_->GetLogNumber() != log_number) {
+    // delete the obsolete log reader if log number mismatch
+    if (iter != log_readers_.end()) {
+      log_readers_.erase(iter);
+    }
+    // initialize log reader from log_number
+    // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Recovering log #%" PRIu64 " mode %d", log_number, + static_cast(immutable_db_options_.wal_recovery_mode)); + + std::unique_ptr file_reader; + { + std::unique_ptr file; + Status status = fs_->NewSequentialFile( + fname, fs_->OptimizeForLogRead(file_options_), &file, + nullptr); + if (!status.ok()) { + *log_reader = nullptr; + return status; + } + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); + } + + // Create the log reader. + LogReaderContainer* log_reader_container = new LogReaderContainer( + env_, immutable_db_options_.info_log, std::move(fname), + std::move(file_reader), log_number); + log_readers_.insert(std::make_pair( + log_number, std::unique_ptr(log_reader_container))); + } + iter = log_readers_.find(log_number); + assert(iter != log_readers_.end()); + *log_reader = iter->second->reader_; + return Status::OK(); +} + +// After manifest recovery, replay WALs and refresh log_readers_ if necessary +// REQUIRES: log_numbers are sorted in ascending order +Status DBImplSecondary::RecoverLogFiles( + const std::vector& log_numbers, SequenceNumber* next_sequence, + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + mutex_.AssertHeld(); + Status status; + for (auto log_number : log_numbers) { + log::FragmentBufferedReader* reader = nullptr; + status = MaybeInitLogReader(log_number, &reader); + if (!status.ok()) { + return status; + } + assert(reader != nullptr); + } + for (auto log_number : log_numbers) { + auto it = log_readers_.find(log_number); + assert(it != log_readers_.end()); + log::FragmentBufferedReader* reader = it->second->reader_; + // Manually update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(log_number); + + // Determine if we should tolerate incomplete records at the tail end of the + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + + while (reader->ReadRecord(&record, &scratch, + immutable_db_options_.wal_recovery_mode) && + status.ok()) { + if (record.size() < WriteBatchInternal::kHeader) { + reader->GetReporter()->Corruption( + record.size(), Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); + SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); + std::vector column_family_ids; + status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + const std::vector& l0_files = + cfd->current()->storage_info()->LevelFiles(0); + SequenceNumber seq = + l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno; + // If the write batch's sequence number is smaller than the last + // sequence number of the largest sequence persisted for this column + // family, then its data must reside in an SST that has already been + // added in the prior MANIFEST replay. 
+ if (seq_of_batch <= seq) { + continue; + } + auto curr_log_num = port::kMaxUint64; + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. + if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); + cfd->mem()->SetNextLogNumber(log_number); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/, + true, log_number, this, false /* concurrent_memtable_writes */, + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + // passing null flush_scheduler will disable memtable flushing which is + // needed for secondary instances + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } else { + // We are treating this as a failure while reading since we read valid + // blocks that do not form coherent data + reader->GetReporter()->Corruption(record.size(), status); + } + } + if (!status.ok()) { + return status; + } + } + // remove logreaders from map after successfully recovering the WAL + if (log_readers_.size() > 1) { + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); + } + return status; +} + +// Implementation of the DB interface +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value); +} + +Status DBImplSecondary::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + auto cfh = static_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + 
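GetImpl() above serves reads on the secondary from whatever state has been replayed so far; the caller decides when to refresh that state. A caller-side sketch of the intended open, catch-up, read cycle follows; the paths and the key are placeholders.

#include <cassert>
#include <string>

#include "rocksdb/db.h"

void SecondaryReadSketch(const std::string& primary_path,
                         const std::string& secondary_path) {
  rocksdb::Options options;
  options.max_open_files = -1;  // OpenAsSecondary currently requires -1
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::OpenAsSecondary(options, primary_path, secondary_path, &db);
  assert(s.ok());
  // Replay MANIFEST and WAL updates published by the primary, then read.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);  // key is made up
  assert(s.ok() || s.IsNotFound());
  delete db;
}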
} + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + Iterator* result = nullptr; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. 
+ return NewErrorIterator( + Status::NotSupported("snapshot not supported in secondary mode")); + } else { + auto snapshot = versions_->LastSequence(); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, + SequenceNumber snapshot, ReadCallback* read_callback) { + assert(nullptr != cfd); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + snapshot, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = + NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplSecondary::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { + return Status::NotSupported( + "tailing iterator not supported in secondary mode"); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return Status::NotSupported("snapshot not supported in secondary mode"); + } else { + SequenceNumber read_seq = versions_->LastSequence(); + for (auto cfh : column_families) { + ColumnFamilyData* cfd = static_cast(cfh)->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, read_seq, read_callback)); + } + } + return Status::OK(); +} + +Status DBImplSecondary::CheckConsistency() { + mutex_.AssertHeld(); + Status s = DBImpl::CheckConsistency(); + // If DBImpl::CheckConsistency() which is stricter returns success, then we + // do not need to give a second chance. + if (s.ok()) { + return s; + } + // It's possible that DBImpl::CheckConssitency() can fail because the primary + // may have removed certain files, causing the GetFileSize(name) call to + // fail and returning a PathNotFound. In this case, we take a best-effort + // approach and just proceed. + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + + if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) { + return Status::OK(); + } + + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || + s.IsPathNotFound())) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } + } + return corruption_messages.empty() ? 
Status::OK() + : Status::Corruption(corruption_messages); +} + +Status DBImplSecondary::TryCatchUpWithPrimary() { + assert(versions_.get() != nullptr); + assert(manifest_reader_.get() != nullptr); + Status s; + // read the manifest and apply new changes to the secondary instance + std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); + { + InstrumentedMutexLock lock_guard(&mutex_); + s = static_cast_with_check(versions_.get()) + ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, + "[%s] Level summary: %s\n", cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + if (s.ok()) { + for (auto cfd : cfds_changed) { + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); + cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); + } + } + } + job_context.Clean(); + + // Cleanup unused, obsolete files. + JobContext purge_files_job_context(0); + { + InstrumentedMutexLock lock_guard(&mutex_); + // Currently, secondary instance does not own the database files, thus it + // is unnecessary for the secondary to force full scan. + FindObsoleteFiles(&purge_files_job_context, /*force=*/false); + } + if (purge_files_job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(purge_files_job_context); + } + purge_files_job_context.Clean(); + return s; +} + +Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, + const std::string& secondary_path, DB** dbptr) { + *dbptr = nullptr; + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); + std::vector handles; + + Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + delete handles[0]; + } + return s; +} + +Status DB::OpenAsSecondary( + const DBOptions& db_options, const std::string& dbname, + const std::string& secondary_path, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + *dbptr = nullptr; + if (db_options.max_open_files != -1) { + // TODO (yanqin) maybe support max_open_files != -1 by creating hard links + // on SST files so that db secondary can still have access to old SSTs + // while primary instance may delete original. 
+ return Status::InvalidArgument("require max_open_files to be -1"); + } + + DBOptions tmp_opts(db_options); + Status s; + if (nullptr == tmp_opts.info_log) { + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; + } + } + + handles->clear(); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + impl->versions_.reset(new ReactiveVersionSet( + dbname, &impl->immutable_db_options_, impl->file_options_, + impl->table_cache_.get(), impl->write_buffer_manager_, + &impl->write_controller_)); + impl->column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + + impl->mutex_.Lock(); + s = impl->Recover(column_families, true, false, false); + if (s.ok()) { + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (nullptr == cfd) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + SuperVersionContext sv_context(true /* create_superversion */); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} +#else // !ROCKSDB_LITE + +Status DB::OpenAsSecondary(const Options& /*options*/, + const std::string& /*name*/, + const std::string& /*secondary_path*/, + DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenAsSecondary( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::string& /*secondary_path*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h new file mode 100644 index 000000000..24f2e7767 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_secondary.h @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper class to hold log reader, log reporter, log status. 
+class LogReaderContainer { + public: + LogReaderContainer() + : reader_(nullptr), reporter_(nullptr), status_(nullptr) {} + LogReaderContainer(Env* env, std::shared_ptr info_log, + std::string fname, + std::unique_ptr&& file_reader, + uint64_t log_number) { + LogReporter* reporter = new LogReporter(); + status_ = new Status(); + reporter->env = env; + reporter->info_log = info_log.get(); + reporter->fname = std::move(fname); + reporter->status = status_; + reporter_ = reporter; + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader), + reporter, true /*checksum*/, + log_number); + } + log::FragmentBufferedReader* reader_; + log::Reader::Reporter* reporter_; + Status* status_; + ~LogReaderContainer() { + delete reader_; + delete reporter_; + delete status_; + } + private: + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + std::string fname; + Status* status; // nullptr if immutable_db_options_.paranoid_checks==false + void Corruption(size_t bytes, const Status& s) override { + ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", + (this->status == nullptr ? "(ignoring error) " : ""), + fname.c_str(), static_cast(bytes), + s.ToString().c_str()); + if (this->status != nullptr && this->status->ok()) { + *this->status = s; + } + } + }; +}; + +// The secondary instance shares access to the storage as the primary. +// The secondary is able to read and replay changes described in both the +// MANIFEST and the WAL files without coordination with the primary. +// The secondary instance can be opened using `DB::OpenAsSecondary`. After +// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best +// effort attempts to catch up with the primary. +class DBImplSecondary : public DBImpl { + public: + DBImplSecondary(const DBOptions& options, const std::string& dbname); + ~DBImplSecondary() override; + + // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ + // and log_readers_ to facilitate future operations. 
+ Status Recover(const std::vector& column_families, + bool read_only, bool error_if_log_file_exist, + bool error_if_data_exists_in_logs, + uint64_t* = nullptr) override; + + // Implementations of the DB interface + using DB::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value); + + using DBImpl::NewIterator; + Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback); + + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Merge; + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Delete; + Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SingleDelete; + Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::CompactRange; + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::CompactFiles; + Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + Status GetLiveFiles(std::vector&, + uint64_t* /*manifest_file_size*/, + bool /*flush_memtable*/ = true) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::Flush; + Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SetDBOptions; + Status SetDBOptions(const std::unordered_map& + /*options_map*/) override { + // 
Currently not supported because changing certain options may cause + // flush/compaction. + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SetOptions; + Status SetOptions( + ColumnFamilyHandle* /*cfd*/, + const std::unordered_map& /*options_map*/) + override { + // Currently not supported because changing certain options may cause + // flush/compaction and/or write to MANIFEST. + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DBImpl::SyncWAL; + Status SyncWAL() override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + using DB::IngestExternalFile; + Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } + + // Try to catch up with the primary by reading as much as possible from the + // log files until there is nothing more to read or encounters an error. If + // the amount of information in the log files to process is huge, this + // method can take long time due to all the I/O and CPU costs. + Status TryCatchUpWithPrimary() override; + + + // Try to find log reader using log_number from log_readers_ map, initialize + // if it doesn't exist + Status MaybeInitLogReader(uint64_t log_number, + log::FragmentBufferedReader** log_reader); + + // Check if all live files exist on file system and that their file sizes + // matche to the in-memory records. It is possible that some live files may + // have been deleted by the primary. In this case, CheckConsistency() does + // not flag the missing file as inconsistency. + Status CheckConsistency() override; + + protected: + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : 
handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + + bool OwnTablesAndLogs() const override { + // Currently, the secondary instance does not own the database files. It + // simply opens the files of the primary instance and tracks their file + // descriptors until they become obsolete. In the future, the secondary may + // create links to database files. OwnTablesAndLogs will return true then. + return false; + } + + private: + friend class DB; + + // No copying allowed + DBImplSecondary(const DBImplSecondary&); + void operator=(const DBImplSecondary&); + + using DBImpl::Recover; + + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); + Status FindNewLogNumbers(std::vector* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, + std::unordered_set* cfds_changed, + JobContext* job_context); + + std::unique_ptr manifest_reader_; + std::unique_ptr manifest_reporter_; + std::unique_ptr manifest_reader_status_; + + // Cache log readers for each log number, used for continue WAL replay + // after recovery + std::map> log_readers_; + + // Current WAL number replayed for each column family. + std::unordered_map cfd_to_current_log_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc new file mode 100644 index 000000000..8f6f685e4 --- /dev/null +++ b/src/rocksdb/db/db_impl/db_impl_write.cc @@ -0,0 +1,1839 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
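The DBImplSecondary declarations above reject every mutating operation with Status::NotSupported() and expose TryCatchUpWithPrimary() for replaying the primary's MANIFEST and WAL. A minimal, illustrative sketch of that contract through the public API follows; the paths are placeholders and error handling is reduced to asserts.

  #include <cassert>
  #include <string>
  #include "rocksdb/db.h"

  void ReadOnlyFollower() {
    rocksdb::Options options;
    options.max_open_files = -1;  // secondary mode expects all files kept open
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
        options, "/path/to/primary_db", "/path/to/secondary_wal_dir", &db);
    assert(s.ok());

    // Reads are served from the secondary's current view of the data.
    std::string value;
    s = db->Get(rocksdb::ReadOptions(), "some_key", &value);

    // Any write is rejected, matching the overrides declared above.
    s = db->Put(rocksdb::WriteOptions(), "some_key", "some_value");
    assert(s.IsNotSupported());

    // Periodically replay whatever the primary has written since the last call.
    s = db->TryCatchUpWithPrimary();
    assert(s.ok());
    delete db;
  }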
+#include "db/db_impl/db_impl.h" + +#include +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "monitoring/perf_context_imp.h" +#include "options/options_helper.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { +// Convenience methods +Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + return DB::Put(o, column_family, key, val); +} + +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + auto cfh = reinterpret_cast(column_family); + if (!cfh->cfd()->ioptions()->merge_operator) { + return Status::NotSupported("Provide a merge_operator when opening DB"); + } else { + return DB::Merge(o, column_family, key, val); + } +} + +Status DBImpl::Delete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, const Slice& key) { + return DB::Delete(write_options, column_family, key); +} + +Status DBImpl::SingleDelete(const WriteOptions& write_options, + ColumnFamilyHandle* column_family, + const Slice& key) { + return DB::SingleDelete(write_options, column_family, key); +} + +void DBImpl::SetRecoverableStatePreReleaseCallback( + PreReleaseCallback* callback) { + recoverable_state_pre_release_callback_.reset(callback); +} + +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { + return WriteImpl(write_options, my_batch, nullptr, nullptr); +} + +#ifndef ROCKSDB_LITE +Status DBImpl::WriteWithCallback(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback) { + return WriteImpl(write_options, my_batch, callback, nullptr); +} +#endif // ROCKSDB_LITE + +// The main write queue. This is the only write queue that updates LastSequence. +// When using one write queue, the same sequence also indicates the last +// published sequence. +Status DBImpl::WriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used, + size_t batch_cnt, + PreReleaseCallback* pre_release_callback) { + assert(!seq_per_batch_ || batch_cnt != 0); + if (my_batch == nullptr) { + return Status::Corruption("Batch is nullptr!"); + } + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Write(my_batch); + } + } + if (write_options.sync && write_options.disableWAL) { + return Status::InvalidArgument("Sync writes has to enable WAL."); + } + if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with concurrent prepares"); + } + if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { + // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt + return Status::NotSupported( + "pipelined_writes is not compatible with seq_per_batch"); + } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } + // Otherwise IsLatestPersistentState optimization does not make sense + assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || + disable_memtable); + + Status status; + if (write_options.low_pri) { + status = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!status.ok()) { + return status; + } + } + + if (two_write_queues_ && disable_memtable) { + AssignOrder assign_order = + seq_per_batch_ ? 
kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL"); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"); + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; + } + + if (immutable_db_options_.enable_pipelined_write) { + return PipelinedWriteImpl(write_options, my_batch, callback, log_used, + log_ref, disable_memtable, seq_used); + } + + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, batch_cnt, pre_release_callback); + + if (!write_options.disableWAL) { + RecordTick(stats_, WRITE_WITH_WAL); + } + + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + write_thread_.JoinBatchGroup(&w); + if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + // we are a non-leader in a parallel group + + if (w.ShouldWriteToMemtable()) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_memtable_time); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_, write_options.memtable_insert_hint_per_batch); + + PERF_TIMER_START(write_pre_and_post_process_time); + } + + if (write_thread_.CompleteParallelMemTableWriter(&w)) { + // we're responsible for exit batch group + // TODO(myabandeh): propagate status to write_group + auto last_sequence = w.write_group->last_sequence; + versions_->SetLastSequence(last_sequence); + MemTableInsertStatusCheck(w.status); + write_thread_.ExitAsBatchGroupFollower(&w); + } + assert(w.state == WriteThread::STATE_COMPLETED); + // STATE_COMPLETED conditional below handles exit + + status = w.FinalStatus(); + } + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + // write is complete and leader has updated sequence + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + // Once reaches this point, the current writer "w" will try to do its write + // job. 
It may also pick up some of the remaining writers in the "writers_" + // when it finds suitable, and finish them in the same write batch. + // This is how a write job could be done by the other writer. + WriteContext write_context; + WriteThread::WriteGroup write_group; + bool in_parallel_group = false; + uint64_t last_sequence = kMaxSequenceNumber; + + mutex_.Lock(); + + bool need_log_sync = write_options.sync; + bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + if (!two_write_queues_ || !disable_memtable) { + // With concurrent writes we do preprocess only in the write thread that + // also does write to memtable to avoid sync issue on shared data structure + // with the other thread + + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); + + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + if (!two_write_queues_) { + // Assign it after ::PreprocessWrite since the sequence might advance + // inside it by WriteRecoverableState + last_sequence = versions_->LastSequence(); + } + + PERF_TIMER_START(write_pre_and_post_process_time); + } + log::Writer* log_writer = logs_.back().writer; + + mutex_.Unlock(); + + // Add to log and apply to memtable. We can release the lock + // during this phase since &w is currently responsible for logging + // and protects against concurrent loggers and concurrent writes + // into memtables + + TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters"); + last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + + if (status.ok()) { + // Rules for when we can update the memtable concurrently + // 1. supported by memtable + // 2. Puts are not okay if inplace_update_support + // 3. Merges are not okay + // + // Rules 1..2 are enforced by checking the options + // during startup (CheckConcurrentWritesSupported), so if + // options.allow_concurrent_memtable_write is true then they can be + // assumed to be true. Rule 3 is checked for each batch. We could + // relax rules 2 if we could prevent write batches from referring + // more than once to a particular key. + bool parallel = immutable_db_options_.allow_concurrent_memtable_write && + write_group.size > 1; + size_t total_count = 0; + size_t valid_batches = 0; + size_t total_byte_size = 0; + size_t pre_release_callback_cnt = 0; + for (auto* writer : write_group) { + if (writer->CheckCallback(this)) { + valid_batches += writer->batch_cnt; + if (writer->ShouldWriteToMemtable()) { + total_count += WriteBatchInternal::Count(writer->batch); + parallel = parallel && !writer->batch->HasMerge(); + } + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + // Note about seq_per_batch_: either disableWAL is set for the entire write + // group or not. In either case we inc seq for each write batch with no + // failed callback. This means that there could be a batch with + // disalbe_memtable in between; although we do not write this batch to + // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc + // the seq per valid written key to mem. + size_t seq_inc = seq_per_batch_ ? valid_batches : total_count; + + const bool concurrent_update = two_write_queues_; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. 
+ // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, + concurrent_update); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (!two_write_queues_) { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + status = WriteToWAL(write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, last_sequence + 1); + } + } else { + if (status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); + } else { + // Otherwise we inc seq number for memtable writes + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + } + assert(last_sequence != kMaxSequenceNumber); + const SequenceNumber current_sequence = last_sequence + 1; + last_sequence += seq_inc; + + // PreReleaseCallback is called after WAL write and before memtable write + if (status.ok()) { + SequenceNumber next_sequence = current_sequence; + size_t index = 0; + // Note: the logic for advancing seq here must be consistent with the + // logic in WriteBatchInternal::InsertInto(write_group...) as well as + // with WriteBatchInternal::InsertInto(write_batch...) that is called on + // the merged batch during recovery from the WAL. + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + writer->sequence = next_sequence; + if (writer->pre_release_callback) { + Status ws = writer->pre_release_callback->Callback( + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); + if (!ws.ok()) { + status = ws; + break; + } + } + if (seq_per_batch_) { + assert(writer->batch_cnt); + next_sequence += writer->batch_cnt; + } else if (writer->ShouldWriteToMemtable()) { + next_sequence += WriteBatchInternal::Count(writer->batch); + } + } + } + + if (status.ok()) { + PERF_TIMER_GUARD(write_memtable_time); + + if (!parallel) { + // w.sequence will be set inside InsertInto + w.status = WriteBatchInternal::InsertInto( + write_group, current_sequence, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, + 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, + batch_per_txn_); + } else { + write_group.last_sequence = last_sequence; + write_thread_.LaunchParallelMemTableWriters(&write_group); + in_parallel_group = true; + + // Each parallel follower is doing each own writes. The leader should + // also do its own. 
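The group-commit rules above only let followers insert into the memtable in parallel when allow_concurrent_memtable_write is enabled and no batch in the group contains a Merge. From the application side that is a single option toggle; a brief sketch under those assumptions (DB open and error handling omitted):

  #include "rocksdb/options.h"
  #include "rocksdb/write_batch.h"

  rocksdb::Options MakeParallelWriteOptions() {
    rocksdb::Options options;
    // Followers may apply their own batches concurrently (rule 1 above);
    // the default skiplist memtable supports this.
    options.allow_concurrent_memtable_write = true;
    options.inplace_update_support = false;  // rule 2: incompatible with parallel puts
    return options;
  }

  rocksdb::WriteBatch MakeBatch() {
    rocksdb::WriteBatch batch;
    batch.Put("k1", "v1");
    batch.Delete("k2");
    // Adding batch.Merge(...) would force the leader to apply the whole group
    // serially (rule 3 above).
    return batch;
  }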
+ if (w.ShouldWriteToMemtable()) { + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + assert(w.sequence == current_sequence); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, + this, true /*concurrent_memtable_writes*/, seq_per_batch_, + w.batch_cnt, batch_per_txn_, + write_options.memtable_insert_hint_per_batch); + } + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + } + } + PERF_TIMER_START(write_pre_and_post_process_time); + + if (!w.CallbackFailed()) { + WriteStatusCheck(status); + } + + if (need_log_sync) { + mutex_.Lock(); + MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + mutex_.Unlock(); + // Requesting sync with two_write_queues_ is expected to be very rare. We + // hence provide a simple implementation that is not necessarily efficient. + if (two_write_queues_) { + if (manual_wal_flush_) { + status = FlushWAL(true); + } else { + status = SyncWAL(); + } + } + } + + bool should_exit_batch_group = true; + if (in_parallel_group) { + // CompleteParallelWorker returns true if this thread should + // handle exit, false means somebody else did + should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w); + } + if (should_exit_batch_group) { + if (status.ok()) { + // Note: if we are to resume after non-OK statuses we need to revisit how + // we reacts to non-OK statuses here. + versions_->SetLastSequence(last_sequence); + } + MemTableInsertStatusCheck(w.status); + write_thread_.ExitAsBatchGroupLeader(write_group, status); + } + + if (status.ok()) { + status = w.FinalStatus(); + } + return status; +} + +Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t* log_used, uint64_t log_ref, + bool disable_memtable, uint64_t* seq_used) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteContext write_context; + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable); + write_thread_.JoinBatchGroup(&w); + if (w.state == WriteThread::STATE_GROUP_LEADER) { + WriteThread::WriteGroup wal_write_group; + if (w.callback && !w.callback->AllowWriteBatching()) { + write_thread_.WaitForMemTableWriters(); + } + mutex_.Lock(); + bool need_log_sync = !write_options.disableWAL && write_options.sync; + bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); + w.status = PreprocessWrite(write_options, &need_log_sync, &write_context); + PERF_TIMER_START(write_pre_and_post_process_time); + log::Writer* log_writer = logs_.back().writer; + mutex_.Unlock(); + + // This can set non-OK status if callback fail. 
+ last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group); + const SequenceNumber current_sequence = + write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1; + size_t total_count = 0; + size_t total_byte_size = 0; + + if (w.status.ok()) { + SequenceNumber next_sequence = current_sequence; + for (auto writer : wal_write_group) { + if (writer->CheckCallback(this)) { + if (writer->ShouldWriteToMemtable()) { + writer->sequence = next_sequence; + size_t count = WriteBatchInternal::Count(writer->batch); + next_sequence += count; + total_count += count; + } + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + } + } + if (w.disable_wal) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + write_thread_.UpdateLastSequence(current_sequence + total_count - 1); + } + + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + if (w.status.ok() && !write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); + RecordTick(stats_, WRITE_DONE_BY_SELF, 1); + if (wal_write_group.size > 1) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + wal_write_group.size - 1); + RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); + } + w.status = WriteToWAL(wal_write_group, log_writer, log_used, + need_log_sync, need_log_dir_sync, current_sequence); + } + + if (!w.CallbackFailed()) { + WriteStatusCheck(w.status); + } + + if (need_log_sync) { + mutex_.Lock(); + MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + mutex_.Unlock(); + } + + write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); + } + + WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { + PERF_TIMER_GUARD(write_memtable_time); + assert(w.ShouldWriteToMemtable()); + write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group); + if (memtable_write_group.size > 1 && + immutable_db_options_.allow_concurrent_memtable_write) { + write_thread_.LaunchParallelMemTableWriters(&memtable_write_group); + } else { + memtable_write_group.status = WriteBatchInternal::InsertInto( + memtable_write_group, w.sequence, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); + versions_->SetLastSequence(memtable_write_group.last_sequence); + write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); + } + } + + if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { + assert(w.ShouldWriteToMemtable()); + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, + 
write_options.memtable_insert_hint_per_batch); + if (write_thread_.CompleteParallelMemTableWriter(&w)) { + MemTableInsertStatusCheck(w.status); + versions_->SetLastSequence(w.write_group->last_sequence); + write_thread_.ExitAsMemTableWriter(&w, *w.write_group); + } + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + + assert(w.state == WriteThread::STATE_COMPLETED); + return w.FinalStatus(); +} + +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt == 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + +// The 2nd write queue. If enabled it will be used only for WAL-only writes. +// This is the only queue that updates LastPublishedSequence which is only +// applicable in a two-queue setting. 
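UnorderedWriteMemtable() above is the memtable half of the unordered_write path: WAL writes and sequence allocation still go through a write queue, while memtable insertion happens outside it. Application-side this is one option, with the incompatibilities that WriteImpl() rejects; an illustrative configuration is below, and WriteImplWALOnly() that follows is the queue such WAL-only writes go through.

  #include "rocksdb/options.h"

  rocksdb::Options MakeUnorderedWriteOptions() {
    rocksdb::Options options;
    options.unordered_write = true;          // WAL stays ordered; memtable inserts may finish out of order
    options.enable_pipelined_write = false;  // rejected together with unordered_write (see WriteImpl above)
    // Readers needing snapshot consistency across concurrent writers generally
    // coordinate at a higher layer (e.g. a transaction layer); a plain Get()
    // only sees a write once its memtable insertion has completed.
    return options;
  }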
+Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { + Status status; + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + disable_memtable, sub_batch_cnt, pre_release_callback); + RecordTick(stats_, WRITE_WITH_WAL); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + write_thread->JoinBatchGroup(&w); + assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); + if (w.state == WriteThread::STATE_COMPLETED) { + if (log_used != nullptr) { + *log_used = w.log_used; + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return w.FinalStatus(); + } + // else we are the leader of the write batch group + assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. + if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + + WriteThread::WriteGroup write_group; + uint64_t last_sequence; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + // Note: no need to update last_batch_group_size_ here since the batch writes + // to WAL only + + size_t pre_release_callback_cnt = 0; + size_t total_byte_size = 0; + for (auto* writer : write_group) { + if (writer->CheckCallback(this)) { + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } + } + } + + const bool concurrent_update = true; + // Update stats while we are an exclusive group leader, so we know + // that nobody else can be writing to these particular stats. + // We're optimistic, updating the stats before we successfully + // commit. That lets us release our leader status early. 
+ auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, + concurrent_update); + RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_SELF); + auto write_done_by_other = write_group.size - 1; + if (write_done_by_other > 0) { + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); + RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); + } + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); + + PERF_TIMER_STOP(write_pre_and_post_process_time); + + PERF_TIMER_GUARD(write_wal_time); + // LastAllocatedSequence is increased inside WriteToWAL under + // wal_write_mutex_ to ensure ordered events in WAL + size_t seq_inc = 0 /* total_count */; + if (assign_order == kDoAssignOrder) { + size_t total_batch_cnt = 0; + for (auto* writer : write_group) { + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } + } + seq_inc = total_batch_cnt; + } + if (!write_options.disableWAL) { + status = + ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + } else { + // Otherwise we inc seq number to do solely the seq allocation + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } + + size_t memtable_write_cnt = 0; + auto curr_seq = last_sequence + 1; + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + writer->sequence = curr_seq; + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); + curr_seq += writer->batch_cnt; + } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } + // else seq advances only by memtable writes + } + if (status.ok() && write_options.sync) { + assert(!write_options.disableWAL); + // Requesting sync with two_write_queues_ is expected to be very rare. We + // hance provide a simple implementation that is not necessarily efficient. + if (manual_wal_flush_) { + status = FlushWAL(true); + } else { + status = SyncWAL(); + } + } + PERF_TIMER_START(write_pre_and_post_process_time); + + if (!w.CallbackFailed()) { + WriteStatusCheck(status); + } + if (status.ok()) { + size_t index = 0; + for (auto* writer : write_group) { + if (!writer->CallbackFailed() && writer->pre_release_callback) { + assert(writer->sequence != kMaxSequenceNumber); + Status ws = writer->pre_release_callback->Callback( + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); + if (!ws.ok()) { + status = ws; + break; + } + } + } + } + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); + if (status.ok()) { + status = w.FinalStatus(); + } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } + return status; +} + +void DBImpl::WriteStatusCheck(const Status& status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. 
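The end of WriteImplWALOnly() above shows the sync fallback: with manual_wal_flush_ set, a sync request becomes FlushWAL(true), otherwise SyncWAL(). A minimal caller-side sketch, assuming DBOptions::manual_wal_flush == true and db is an already-open rocksdb::DB*:

  #include "rocksdb/db.h"

  rocksdb::Status PersistBufferedWal(rocksdb::DB* db) {
    // Under manual_wal_flush, AddRecord() only fills the log writer's in-memory
    // buffer; FlushWAL(true) pushes it to the OS and syncs the file.
    return db->FlushWAL(/*sync=*/true);
    // Without manual_wal_flush, db->SyncWAL() is the equivalent call, which is
    // exactly the other branch taken above.
  }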
+ if (immutable_db_options_.paranoid_checks && !status.ok() && + !status.IsBusy() && !status.IsIncomplete()) { + mutex_.Lock(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); + mutex_.Unlock(); + } +} + +void DBImpl::MemTableInsertStatusCheck(const Status& status) { + // A non-OK status here indicates that the state implied by the + // WAL has diverged from the in-memory state. This could be + // because of a corrupt write_batch (very bad), or because the + // client specified an invalid column family and didn't specify + // ignore_missing_column_families. + if (!status.ok()) { + mutex_.Lock(); + assert(!error_handler_.IsBGWorkStopped()); + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); + mutex_.Unlock(); + } +} + +Status DBImpl::PreprocessWrite(const WriteOptions& write_options, + bool* need_log_sync, + WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr && need_log_sync != nullptr); + Status status; + + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); + + assert(!single_column_family_mode_ || + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); + if (UNLIKELY(status.ok() && !single_column_family_mode_ && + total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); + status = SwitchWAL(write_context); + } + + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, they may also + // be flushed. We may end up with flushing much more DBs than needed. It's + // suboptimal but still correct. + WaitForPendingWrites(); + status = HandleWriteBufferFull(write_context); + } + + if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { + status = TrimMemtableHistory(write_context); + } + + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); + status = ScheduleFlushes(write_context); + } + + PERF_TIMER_STOP(write_scheduling_flushes_compactions_time); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || + write_controller_.NeedsDelay()))) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_delay_time); + // We don't know size of curent batch so that we always use the size + // for previous one. It might create a fairness issue that expiration + // might happen for smaller writes but larger writes can go through. + // Can optimize it if it is an issue. + status = DelayWrite(last_batch_group_size_, write_options); + PERF_TIMER_START(write_pre_and_post_process_time); + } + + if (status.ok() && *need_log_sync) { + // Wait until the parallel syncs are finished. Any sync process has to sync + // the front log too so it is enough to check the status of front() + // We do a while loop since log_sync_cv_ is signalled when any sync is + // finished + // Note: there does not seem to be a reason to wait for parallel sync at + // this early step but it is not important since parallel sync (SyncWAL) and + // need_log_sync are usually not used together. + while (logs_.front().getting_synced) { + log_sync_cv_.Wait(); + } + for (auto& log : logs_) { + assert(!log.getting_synced); + // This is just to prevent the logs to be synced by a parallel SyncWAL + // call. 
We will do the actual syncing later after we will write to the + // WAL. + // Note: there does not seem to be a reason to set this early before we + // actually write to the WAL + log.getting_synced = true; + } + } else { + *need_log_sync = false; + } + + return status; +} + +WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, + WriteBatch* tmp_batch, size_t* write_with_wal, + WriteBatch** to_be_cached_state) { + assert(write_with_wal != nullptr); + assert(tmp_batch != nullptr); + assert(*to_be_cached_state == nullptr); + WriteBatch* merged_batch = nullptr; + *write_with_wal = 0; + auto* leader = write_group.leader; + assert(!leader->disable_wal); // Same holds for all in the batch group + if (write_group.size == 1 && !leader->CallbackFailed() && + leader->batch->GetWalTerminationPoint().is_cleared()) { + // we simply write the first WriteBatch to WAL if the group only + // contains one batch, that batch should be written to the WAL, + // and the batch is not wanting to be truncated + merged_batch = leader->batch; + if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) { + *to_be_cached_state = merged_batch; + } + *write_with_wal = 1; + } else { + // WAL needs all of the batches flattened into a single batch. + // We could avoid copying here with an iov-like AddRecord + // interface + merged_batch = tmp_batch; + for (auto writer : write_group) { + if (!writer->CallbackFailed()) { + WriteBatchInternal::Append(merged_batch, writer->batch, + /*WAL_only*/ true); + if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) { + // We only need to cache the last of such write batch + *to_be_cached_state = writer->batch; + } + (*write_with_wal)++; + } + } + } + return merged_batch; +} + +// When two_write_queues_ is disabled, this function is called from the only +// write thread. Otherwise this must be called holding log_write_mutex_. +Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size) { + assert(log_size != nullptr); + Slice log_entry = WriteBatchInternal::Contents(&merged_batch); + *log_size = log_entry.size(); + // When two_write_queues_ WriteToWAL has to be protected from concurretn calls + // from the two queues anyway and log_write_mutex_ is already held. Otherwise + // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord + // from possible concurrent calls via the FlushWAL by the application. + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + // Due to performance cocerns of missed branch prediction penalize the new + // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case + // when we do not need any locking. 
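MergeBatch() above flattens a whole write group into one WriteBatch so the WAL receives a single record. The same property is visible through the public API: one WriteBatch holding several operations is logged and applied atomically. A short sketch, assuming db is already open:

  #include "rocksdb/db.h"
  #include "rocksdb/write_batch.h"

  rocksdb::Status AtomicMultiWrite(rocksdb::DB* db) {
    rocksdb::WriteBatch batch;
    batch.Put("user:1", "alice");
    batch.Put("user:2", "bob");
    batch.Delete("user:0");
    // The batch is serialized once and written as one WAL record; sequence
    // numbers advance by the number of operations it contains.
    return db->Write(rocksdb::WriteOptions(), &batch);
  }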
+ if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } + Status status = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } + if (log_used != nullptr) { + *log_used = logfile_number_; + } + total_log_size_ += log_entry.size(); + // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here + // since alive_log_files_ might be modified concurrently + alive_log_files_.back().AddSize(log_entry.size()); + log_empty_ = false; + return status; +} + +Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence) { + Status status; + + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_, + &write_with_wal, &to_be_cached_state); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + + WriteBatchInternal::SetSequence(merged_batch, sequence); + + uint64_t log_size; + status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + + if (status.ok() && need_log_sync) { + StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + // It's safe to access logs_ with unlocked mutex_ here because: + // - we've set getting_synced=true for all logs, + // so other threads won't pop from logs_ while we're here, + // - only writer thread can push to logs_, and we're in + // writer thread, so no one will push to logs_, + // - as long as other threads don't modify it, it's safe to read + // from std::deque from multiple threads concurrently. + for (auto& log : logs_) { + status = log.writer->file()->Sync(immutable_db_options_.use_fsync); + if (!status.ok()) { + break; + } + } + if (status.ok() && need_log_dir_sync) { + // We only sync WAL directory the first time WAL syncing is + // requested, so that in case users never turn on WAL sync, + // we can avoid the disk I/O in the write code path. 
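need_log_sync in the code above is driven by WriteOptions::sync; the first synced write additionally fsyncs the WAL directory. A hedged caller-side sketch (db assumed open):

  #include "rocksdb/db.h"

  rocksdb::Status DurablePut(rocksdb::DB* db) {
    rocksdb::WriteOptions wo;
    wo.sync = true;         // sync the WAL before returning (fdatasync, or fsync
                            // if DBOptions::use_fsync is set)
    wo.disableWAL = false;  // sync=true together with disableWAL=true is rejected
    return db->Put(wo, "key", "value");
  }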
+ status = directories_.GetWalDir()->Fsync(); + } + } + + if (merged_batch == &tmp_batch_) { + tmp_batch_.Clear(); + } + if (status.ok()) { + auto stats = default_cf_internal_stats_; + if (need_log_sync) { + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); + RecordTick(stats_, WAL_FILE_SYNCED); + } + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return status; +} + +Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* log_used, + SequenceNumber* last_sequence, + size_t seq_inc) { + Status status; + + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group + WriteBatch tmp_batch; + size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch = + MergeBatch(write_group, &tmp_batch, &write_with_wal, &to_be_cached_state); + + // We need to lock log_write_mutex_ since logs_ and alive_log_files might be + // pushed back concurrently + log_write_mutex_.Lock(); + if (merged_batch == write_group.leader->batch) { + write_group.leader->log_used = logfile_number_; + } else if (write_with_wal > 1) { + for (auto writer : write_group) { + writer->log_used = logfile_number_; + } + } + *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + auto sequence = *last_sequence + 1; + WriteBatchInternal::SetSequence(merged_batch, sequence); + + log::Writer* log_writer = logs_.back().writer; + uint64_t log_size; + status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } + log_write_mutex_.Unlock(); + + if (status.ok()) { + const bool concurrent = true; + auto stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); + RecordTick(stats_, WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, + concurrent); + RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); + } + return status; +} + +Status DBImpl::WriteRecoverableState() { + mutex_.AssertHeld(); + if (!cached_recoverable_state_empty_) { + bool dont_care_bool; + SequenceNumber next_seq; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + SequenceNumber seq; + if (two_write_queues_) { + seq = versions_->FetchAddLastAllocatedSequence(0); + } else { + seq = versions_->LastSequence(); + } + WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); + auto status = WriteBatchInternal::InsertInto( + &cached_recoverable_state_, column_family_memtables_.get(), + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); + auto last_seq = next_seq - 1; + if (two_write_queues_) { + versions_->FetchAddLastAllocatedSequence(last_seq - seq); + versions_->SetLastPublishedSequence(last_seq); + } + versions_->SetLastSequence(last_seq); + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + if (status.ok() && recoverable_state_pre_release_callback_) { + const bool DISABLE_MEMTABLE = true; + for (uint64_t sub_batch_seq = seq + 1; + sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) { + uint64_t const no_log_num = 0; + // Unlock it since the callback might end up 
locking mutex. e.g., + // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB + mutex_.Unlock(); + status = recoverable_state_pre_release_callback_->Callback( + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); + mutex_.Lock(); + } + } + if (status.ok()) { + cached_recoverable_state_.Clear(); + cached_recoverable_state_empty_ = true; + } + return status; + } + return Status::OK(); +} + +void DBImpl::SelectColumnFamiliesForAtomicFlush( + autovector* cfds) { + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + cfds->push_back(cfd); + } + } +} + +// Assign sequence number for atomic flush. +void DBImpl::AssignAtomicFlushSeq(const autovector& cfds) { + assert(immutable_db_options_.atomic_flush); + auto seq = versions_->LastSequence(); + for (auto cfd : cfds) { + cfd->imm()->AssignAtomicFlushSeq(seq); + } +} + +Status DBImpl::SwitchWAL(WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr); + Status status; + + if (alive_log_files_.begin()->getting_flushed) { + return status; + } + + auto oldest_alive_log = alive_log_files_.begin()->number; + bool flush_wont_release_oldest_log = false; + if (allow_2pc()) { + auto oldest_log_with_uncommitted_prep = + logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep(); + + assert(oldest_log_with_uncommitted_prep == 0 || + oldest_log_with_uncommitted_prep >= oldest_alive_log); + if (oldest_log_with_uncommitted_prep > 0 && + oldest_log_with_uncommitted_prep == oldest_alive_log) { + if (unable_to_release_oldest_log_) { + // we already attempted to flush all column families dependent on + // the oldest alive log but the log still contained uncommitted + // transactions so there is still nothing that we can do. + return status; + } else { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to release oldest log due to uncommitted transaction"); + unable_to_release_oldest_log_ = true; + flush_wont_release_oldest_log = true; + } + } + } + if (!flush_wont_release_oldest_log) { + // we only mark this log as getting flushed if we have successfully + // flushed all data in this log. If this log contains outstanding prepared + // transactions then we cannot flush this log until those transactions are + // commited. + unable_to_release_oldest_log_ = false; + alive_log_files_.begin()->getting_flushed = true; + } + + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing all column families with data in WAL number %" PRIu64 + ". 
Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, + oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->OldestLogToKeep() <= oldest_alive_log) { + cfds.push_back(cfd); + } + } + MaybeFlushStatsCF(&cfds); + } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + for (const auto cfd : cfds) { + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { + mutex_.AssertHeld(); + assert(write_context != nullptr); + Status status; + + // Before a new memtable is added in SwitchMemtable(), + // write_buffer_manager_->ShouldFlush() will keep returning true. If another + // thread is writing to another DB with the same write buffer, they may also + // be flushed. We may end up with flushing much more DBs than needed. It's + // suboptimal but still correct. + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing column family with oldest memtable entry. Write buffer is " + "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", + write_buffer_manager_->memory_usage(), + write_buffer_manager_->buffer_size()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + ColumnFamilyData* cfd_picked = nullptr; + SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; + + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (!cfd->mem()->IsEmpty()) { + // We only consider active mem table, hoping immutable memtable is + // already in the process of flushing. 
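The write_buffer_manager_->ShouldFlush() trigger handled here comes from a WriteBufferManager that may be shared by several DBs, which is why the comment above warns about over-flushing. An illustrative setup; the 512 MB cap is an arbitrary placeholder:

  #include <memory>
  #include "rocksdb/options.h"
  #include "rocksdb/write_buffer_manager.h"

  rocksdb::Options MakeSharedBufferOptions() {
    auto wbm = std::make_shared<rocksdb::WriteBufferManager>(512ull << 20);
    rocksdb::Options options;
    options.write_buffer_manager = wbm;  // may be handed to multiple DBs
    // When total memtable memory crosses the cap, the path above switches the
    // memtable of the column family with the oldest creation sequence and
    // schedules a flush with FlushReason::kWriteBufferFull.
    return options;
  }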
+ uint64_t seq = cfd->mem()->GetCreationSeq(); + if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { + cfd_picked = cfd; + seq_num_for_cf_picked = seq; + } + } + } + if (cfd_picked != nullptr) { + cfds.push_back(cfd_picked); + } + MaybeFlushStatsCF(&cfds); + } + + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (const auto cfd : cfds) { + if (cfd->mem()->IsEmpty()) { + continue; + } + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->UnrefAndTryDelete(); + if (!status.ok()) { + break; + } + } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (const auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +uint64_t DBImpl::GetMaxTotalWalSize() const { + mutex_.AssertHeld(); + return mutable_db_options_.max_total_wal_size == 0 + ? 4 * max_total_in_memory_state_ + : mutable_db_options_.max_total_wal_size; +} + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::DelayWrite(uint64_t num_bytes, + const WriteOptions& write_options) { + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); + uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + if (delay > 0) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); + mutex_.Unlock(); + // We will delay the write until we have slept for delay ms or + // we don't need a delay anymore + const uint64_t kDelayInterval = 1000; + uint64_t stall_end = sw.start_time() + delay; + while (write_controller_.NeedsDelay()) { + if (env_->NowMicros() >= stall_end) { + // We already delayed this write `delay` microseconds + break; + } + + delayed = true; + // Sleep for 0.001 seconds + env_->SleepForMicroseconds(kDelayInterval); + } + mutex_.Lock(); + write_thread_.EndWriteStall(); + } + + // Don't wait if there's a background error, even if its a soft error. 
We + // might wait here indefinitely as the background compaction may never + // finish successfully, resulting in the stall condition lasting + // indefinitely + while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) { + if (write_options.no_slowdown) { + return Status::Incomplete("Write stall"); + } + delayed = true; + + // Notify write_thread_ about the stall so it can setup a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); + bg_cv_.Wait(); + write_thread_.EndWriteStall(); + } + } + assert(!delayed || !write_options.no_slowdown); + if (delayed) { + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteStallMicros, time_delayed); + RecordTick(stats_, STALL_MICROS, time_delayed); + } + + // If DB is not in read-only mode and write_controller is not stopping + // writes, we can ignore any background errors and allow the write to + // proceed + Status s; + if (write_controller_.IsStopped()) { + // If writes are still stopped, it means we bailed due to a background + // error + s = Status::Incomplete(error_handler_.GetBGError().ToString()); + } + if (error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; +} + +Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, + WriteBatch* my_batch) { + assert(write_options.low_pri); + // This is called outside the DB mutex. Although it is safe to make the call, + // the consistency condition is not guaranteed to hold. It's OK to live with + // it in this case. + // If we need to speed compaction, it means the compaction is left behind + // and we start to limit low pri writes to a limit. + if (write_controller_.NeedSpeedupCompaction()) { + if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) { + // For 2PC, we only rate limit prepare, not commit. + return Status::OK(); + } + if (write_options.no_slowdown) { + return Status::Incomplete("Low priority write stall"); + } else { + assert(my_batch != nullptr); + // Rate limit those writes. The reason that we don't completely wait + // is that in case the write is heavy, low pri writes may never have + // a chance to run. Now we guarantee we are still slowly making + // progress. 
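ThrottleLowPriWritesIfNeeded() above applies only to writes marked low_pri, and both it and DelayWrite() turn stalls into Status::Incomplete() when no_slowdown is set. A small caller-side sketch (db assumed open):

  #include "rocksdb/db.h"

  rocksdb::Status BestEffortBackgroundPut(rocksdb::DB* db) {
    rocksdb::WriteOptions wo;
    wo.low_pri = true;      // eligible for the rate limiter above when compaction
                            // needs to be sped up
    wo.no_slowdown = true;  // never block: a stalled write comes back as
                            // Status::Incomplete() instead
    rocksdb::Status s = db->Put(wo, "metrics:latest", "payload");
    if (s.IsIncomplete()) {
      // The write was refused rather than delayed; drop it or retry later.
    }
    return s;
  }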
+ PERF_TIMER_GUARD(write_delay_time); + write_controller_.low_pri_rate_limiter()->Request( + my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + } + return Status::OK(); +} + +void DBImpl::MaybeFlushStatsCF(autovector* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + +Status DBImpl::TrimMemtableHistory(WriteContext* context) { + autovector cfds; + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) != + nullptr) { + cfds.push_back(tmp_cfd); + } + for (auto& cfd : cfds) { + autovector to_delete; + cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); + if (!to_delete.empty()) { + for (auto m : to_delete) { + delete m; + } + context->superversion_context.NewSuperVersion(); + assert(context->superversion_context.new_superversion.get() != nullptr); + cfd->InstallSuperVersion(&context->superversion_context, &mutex_); + } + + if (cfd->UnrefAndTryDelete()) { + cfd = nullptr; + } + } + return Status::OK(); +} + +Status DBImpl::ScheduleFlushes(WriteContext* context) { + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + for (auto cfd : cfds) { + cfd->Ref(); + } + flush_scheduler_.Clear(); + } else { + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfds.push_back(tmp_cfd); + } + MaybeFlushStatsCF(&cfds); + } + Status status; + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + for (auto& cfd : cfds) { + if (!cfd->mem()->IsEmpty()) { + status = SwitchMemtable(cfd, context); + } + if (cfd->UnrefAndTryDelete()) { + cfd = nullptr; + } + if (!status.ok()) { + break; + } + } + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; +} + +#ifndef ROCKSDB_LITE +void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, + const MemTableInfo& mem_table_info) { + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + + for (auto listener : immutable_db_options_.listeners) { + listener->OnMemTableSealed(mem_table_info); + } +} +#endif // ROCKSDB_LITE + +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of 
the writer queue +// REQUIRES: this thread is currently at the front of the 2nd writer queue if +// two_write_queues_ is true (This is to simplify the reasoning.) +Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { + mutex_.AssertHeld(); + WriteThread::Writer nonmem_w; + std::unique_ptr lfile; + log::Writer* new_log = nullptr; + MemTable* new_mem = nullptr; + + // Recoverable state is persisted in WAL. After memtable switch, WAL might + // be deleted, so we write the state to memtable to be persisted as well. + Status s = WriteRecoverableState(); + if (!s.ok()) { + return s; + } + + // Attempt to switch to a new memtable and trigger flush of old. + // Do this without holding the dbmutex lock. + assert(versions_->prev_log_number() == 0); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + bool creating_new_log = !log_empty_; + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + uint64_t recycle_log_number = 0; + if (creating_new_log && immutable_db_options_.recycle_log_file_num && + !log_recycle_files_.empty()) { + recycle_log_number = log_recycle_files_.front(); + } + uint64_t new_log_number = + creating_new_log ? versions_->NewFileNumber() : logfile_number_; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + + // Set memtable_info for memtable sealed callback +#ifndef ROCKSDB_LITE + MemTableInfo memtable_info; + memtable_info.cf_name = cfd->GetName(); + memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); + memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.num_entries = cfd->mem()->num_entries(); + memtable_info.num_deletes = cfd->mem()->num_deletes(); +#endif // ROCKSDB_LITE + // Log this later after lock release. It may be outdated, e.g., if background + // flush happens before logging, but that should be ok. + int num_imm_unflushed = cfd->imm()->NumNotFlushed(); + const auto preallocate_block_size = + GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + mutex_.Unlock(); + if (creating_new_log) { + // TODO: Write buffer size passed in should be max of all CF's instead + // of mutable_cf_options.write_buffer_size. + s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, + &new_log); + } + if (s.ok()) { + SequenceNumber seq = versions_->LastSequence(); + new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); + context->superversion_context.NewSuperVersion(); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 + ". Immutable memtables: %d.\n", + cfd->GetName().c_str(), new_log_number, num_imm_unflushed); + mutex_.Lock(); + if (recycle_log_number != 0) { + // Since renaming the file is done outside DB mutex, we need to ensure + // concurrent full purges don't delete the file while we're recycling it. + // To achieve that we hold the old log number in the recyclable list until + // after it has been renamed. 
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ log_write_mutex_.Lock();
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+ // Always flush the buffer of the last log before switching to a new one
+ log::Writer* cur_log_writer = logs_.back().writer;
+ s = cur_log_writer->WriteBuffer();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ log_write_mutex_.Unlock();
+ }
+
+ if (!s.ok()) {
+ // how do we fail if we're not creating new log?
+ assert(creating_new_log);
+ if (new_mem) {
+ delete new_mem;
+ }
+ if (new_log) {
+ delete new_log;
+ }
+ SuperVersion* new_superversion =
+ context->superversion_context.new_superversion.release();
+ if (new_superversion != nullptr) {
+ delete new_superversion;
+ }
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
+ // all this is just optimization to delete logs that
+ // are no longer needed -- if CF is empty, that means it
+ // doesn't need that particular log to stay alive, so we just
+ // advance the log number. no need to persist this in the manifest
+ if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
+ loop_cfd->imm()->NumNotFlushed() == 0) {
+ if (creating_new_log) {
+ loop_cfd->SetLogNumber(logfile_number_);
+ }
+ loop_cfd->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+#ifndef ROCKSDB_LITE
+ mutex_.Unlock();
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+ mutex_.Lock();
+#endif // ROCKSDB_LITE
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ if (nullptr == opt.timestamp) {
+ // Pre-allocate size of write batch conservatively.
+ // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+ // and we allocate 11 extra bytes for key length, as well as value length.
+ WriteBatch batch(key.size() + value.size() + 24);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+ }
+ const Slice* ts = opt.timestamp;
+ assert(nullptr != ts);
+ size_t ts_sz = ts->size();
+ WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0,
+ ts_sz);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ s = batch.AssignTimestamp(*ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch;
+ batch.Delete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch;
+ batch.SingleDelete(column_family, key);
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch;
+ batch.DeleteRange(column_family, begin_key, end_key);
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch;
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_secondary_test.cc b/src/rocksdb/db/db_impl/db_secondary_test.cc
new file mode 100644
index 000000000..0b34181de
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_secondary_test.cc
@@ -0,0 +1,869 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors. 
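The DB::Put, DB::Delete, DB::SingleDelete, DB::DeleteRange and DB::Merge definitions in db_impl_write.cc above are thin convenience wrappers: each builds a temporary WriteBatch and funnels it through a single DB::Write call. A minimal caller-side sketch of that equivalence, using only the public RocksDB API (the database path and keys below are illustrative, not taken from this patch):

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_batch.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  // "/tmp/put_vs_batch_demo" is an arbitrary example path.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/put_vs_batch_demo", &db);
  assert(s.ok());

  // Convenience wrapper: internally builds a one-entry WriteBatch and
  // hands it to DB::Write, as in the default implementation above.
  s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
  assert(s.ok());

  // Equivalent explicit form: the batch groups several updates into one
  // atomic write (a single WAL append), which is what the wrappers do
  // for the single-update case.
  rocksdb::WriteBatch batch;
  batch.Put("key2", "value2");
  batch.Delete("key1");
  s = db->Write(rocksdb::WriteOptions(), &batch);
  assert(s.ok());

  delete db;
  return 0;
}

Batching is why all of the wrappers route through Write(): one batch means one atomic append to the WAL regardless of how many updates it carries.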
+ +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class DBSecondaryTest : public DBTestBase { + public: + DBSecondaryTest() + : DBTestBase("/db_secondary_test"), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { + secondary_path_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTest() override { + CloseSecondary(); + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_path_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); + } + + void OpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + db_secondary_->DestroyColumnFamilyHandle(h); + } + handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; +}; + +void DBSecondaryTest::OpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, + int expected_log, int expected_sst, + int expected_manifest) const { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + +TEST_F(DBSecondaryTest, ReopenAsSecondary) { + Options options; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Put("bar", "bar_value")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + Close(); + + ASSERT_OK(ReopenAsSecondary(options)); + ASSERT_EQ("foo_value", Get("foo")); + ASSERT_EQ("bar_value", Get("bar")); + ReadOptions ropts; + ropts.verify_checksums = true; + auto db1 = static_cast(db_); + ASSERT_NE(nullptr, db1); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + if (0 == count) { + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value", iter->value().ToString()); + } 
else if (1 == count) { + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value", iter->value().ToString()); + } + ++count; + } + delete iter; + ASSERT_EQ(2, count); +} + +TEST_F(DBSecondaryTest, OpenAsSecondary) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); +} + +namespace { +class TraceFileEnv : public EnvWrapper { + public: + explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& env_options) override { + class TracedRandomAccessFile : public RandomAccessFile { + public: + TracedRandomAccessFile(std::unique_ptr&& target, + std::atomic& counter) + : target_(std::move(target)), files_closed_(counter) {} + ~TracedRandomAccessFile() override { + files_closed_.fetch_add(1, std::memory_order_relaxed); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + + private: + std::unique_ptr target_; + std::atomic& files_closed_; + }; + Status s = target()->NewRandomAccessFile(f, r, env_options); + if (s.ok()) { + r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); + } + return s; + } + + int files_closed() const { + return files_closed_.load(std::memory_order_relaxed); + } + + private: + std::atomic files_closed_{0}; +}; +} // namespace + +TEST_F(DBSecondaryTest, SecondaryCloseFiles) { + Options options; + options.env = env_; + options.max_open_files = 1; + options.disable_auto_compactions = true; + Reopen(options); + Options options1; + std::unique_ptr traced_env(new TraceFileEnv(env_)); + options1.env = traced_env.get(); + OpenSecondary(options1); + + static const auto verify_db = [&]() { + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + 
ASSERT_EQ(iter1->key(), iter2->key()); + ASSERT_EQ(iter1->value(), iter2->value()); + } + ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + }; + + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Put("c", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Put("d", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); + + Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); + ASSERT_TRUE(s.IsNotSupported()); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); +} + +TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options1); + cf_descs.emplace_back("pikachu", options1); + cf_descs.emplace_back("eevee", options1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_NOK(s); +} + +TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); + 
ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"VersionSet::ProcessManifestWrites:AfterNewManifest", + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // Make sure db calls RecoverLogFiles so as to trigger a manifest write, + // which causes the db to switch to a new MANIFEST upon start. + port::Thread ro_db_thread([&]() { + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + CloseSecondary(); + }); + Reopen(options); + ro_db_thread.join(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + int table_files_not_exist = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", + [&](void* arg) { + Status s = *reinterpret_cast(arg); + if (s.IsPathNotFound()) { + ++table_files_not_exist; + } else if (!s.ok()) { + assert(false); // Should not reach here + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + 
OpenSecondary(options1); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_NE(nullptr, db_secondary_full()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, + // ..., 9. 
+ const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. + Reopen(options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); +} + +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + 
std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + 
iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} +#endif //! ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} -- cgit v1.2.3
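For reference, the secondary-instance workflow exercised throughout db_secondary_test.cc above reduces to three public calls: DB::OpenAsSecondary, DB::TryCatchUpWithPrimary, and ordinary reads. A minimal standalone sketch under the same assumptions the tests make (max_open_files set to -1 for the secondary; both directory paths below are placeholders):

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  // The tests above open the secondary with max_open_files = -1.
  options.max_open_files = -1;

  // "/path/to/primary" is the existing primary's directory (opened
  // read-only); "/tmp/secondary" is a separate directory for the
  // secondary instance's own files.
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary", "/tmp/secondary", &db);
  if (!s.ok()) {
    std::cerr << "OpenAsSecondary failed: " << s.ToString() << std::endl;
    return 1;
  }

  // Pick up MANIFEST updates and the WAL tail written by the primary
  // since the secondary was opened (or since the last catch-up).
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "foo", &value);
  if (s.ok()) {
    std::cout << "foo => " << value << std::endl;
  }

  delete db;
  return 0;
}

As the tests above show, TryCatchUpWithPrimary can be called repeatedly; the secondary only tails the primary's MANIFEST and WAL files and never writes to the primary's directory.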