Diffstat (limited to 'src/rocksdb/db/external_sst_file_ingestion_job.cc')
-rw-r--r-- | src/rocksdb/db/external_sst_file_ingestion_job.cc | 1020 |
1 file changed, 1020 insertions, 0 deletions
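For context when reading the diff below: the new file implements the ingestion job that backs DB::IngestExternalFile(). The sketch that follows is not part of this change; it only lists the IngestExternalFileOptions fields the job reads as ingestion_options_ (all of them appear in the code below), with illustrative values. The helper name MakeExampleIngestOptions is hypothetical.

    #include "rocksdb/options.h"  // IngestExternalFileOptions

    // Illustrative only: these are example choices, not the library defaults.
    rocksdb::IngestExternalFileOptions MakeExampleIngestOptions() {
      rocksdb::IngestExternalFileOptions ifo;
      ifo.move_files = true;                      // Prepare(): hard-link via LinkFile() instead of copying
      ifo.failed_move_fall_back_to_copy = true;   // Prepare(): copy when linking is NotSupported
      ifo.snapshot_consistency = true;            // Run(): force a global seqno while snapshots exist
      ifo.allow_global_seqno = true;              // needed whenever a nonzero seqno must be assigned
      ifo.allow_blocking_flush = true;            // NeedsFlush(): allow flushing overlapping memtables
      ifo.write_global_seqno = false;             // AssignGlobalSeqnoForIngestedFile(): skip the in-place rewrite
      ifo.verify_checksums_before_ingest = true;  // GetIngestedFileInfo(): TableReader::VerifyChecksum()
      ifo.verify_file_checksum = true;            // Prepare(): regenerate and compare file checksums
      ifo.ingest_behind = false;                  // CheckLevelForIngestedBehindFile() path when true
      ifo.fail_if_not_bottommost_level = false;   // AssignLevelAndSeqnoForIngestedFile(): require Lmax placement
      return ifo;
    }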
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc new file mode 100644 index 000000000..ba1277eab --- /dev/null +++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc @@ -0,0 +1,1020 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#include "db/external_sst_file_ingestion_job.h" + +#include <algorithm> +#include <cinttypes> +#include <string> +#include <unordered_set> +#include <vector> + +#include "db/db_impl/db_impl.h" +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status ExternalSstFileIngestionJob::Prepare( + const std::vector<std::string>& external_files_paths, + const std::vector<std::string>& files_checksums, + const std::vector<std::string>& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are ingesting + for (const std::string& file_path : external_files_paths) { + IngestedFileInfo file_to_ingest; + status = + GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv); + if (!status.ok()) { + return status; + } + + if (file_to_ingest.cf_id != + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && + file_to_ingest.cf_id != cfd_->GetID()) { + return Status::InvalidArgument( + "External file column family id don't match"); + } + + if (file_to_ingest.num_entries == 0 && + file_to_ingest.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!file_to_ingest.smallest_internal_key.Valid() || + !file_to_ingest.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); + } + + files_to_ingest_.emplace_back(std::move(file_to_ingest)); + } + + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_ingest_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges + autovector<const IngestedFileInfo*> sorted_files; + for (size_t i = 0; i < num_files; i++) { + sorted_files.push_back(&files_to_ingest_[i]); + } + + std::sort( + sorted_files.begin(), sorted_files.end(), + [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i + 1 < num_files; i++) { + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { + files_overlap_ = true; + break; + } + } + } + + // Hanlde the file temperature + for (size_t i = 0; i < num_files; i++) { + files_to_ingest_[i].file_temperature = file_temperature; + } + + if (ingestion_options_.ingest_behind && files_overlap_) { + return Status::NotSupported("Files have overlapping 
ranges"); + } + + // Copy/Move external files into DB + std::unordered_set<size_t> ingestion_path_ids; + for (IngestedFileInfo& f : files_to_ingest_) { + f.copy_file = false; + const std::string path_outside_db = f.external_file_path; + const std::string path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); + if (ingestion_options_.move_files) { + status = + fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); + if (status.ok()) { + // It is unsafe to assume application had sync the file and file + // directory before ingest the file. For integrity of RocksDB we need + // to sync the file. + std::unique_ptr<FSWritableFile> file_to_sync; + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. + if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } + } + } + } else if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. + f.copy_file = true; + ROCKS_LOG_INFO(db_options_.info_log, + "Triy to link file %s but it's not supported : %s", + path_outside_db.c_str(), status.ToString().c_str()); + } + } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); + // CopyFile also sync the new file. + status = + CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_, Temperature::kUnknown); + } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); + if (!status.ok()) { + break; + } + f.internal_file_path = path_inside_db; + // Initialize the checksum information of ingested files. + f.file_checksum = kUnknownFileChecksum; + f.file_checksum_func_name = kUnknownFileChecksumFuncName; + ingestion_path_ids.insert(f.fd.GetPathId()); + } + + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); + if (status.ok()) { + for (auto path_id : ingestion_path_ids) { + status = directories_->GetDataDir(path_id)->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync directory %" ROCKSDB_PRIszt + " while ingest file: %s", + path_id, status.ToString().c_str()); + break; + } + } + } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + + // Generate and check the sst file checksum. Note that, if + // IngestExternalFileOptions::write_global_seqno is true, we will not update + // the checksum information in the files_to_ingests_ here, since the file is + // upadted with the new global_seqno. After global_seqno is updated, DB will + // generate the new checksum and store it in the Manifest. 
In all other cases + // if ingestion_options_.write_global_seqno == true and + // verify_file_checksum is false, we only check the checksum function name. + if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) { + if (ingestion_options_.verify_file_checksum == false && + files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Only when verify_file_checksum == false and the checksum for ingested + // files are provided, DB will use the provided checksum and does not + // generate the checksum for ingested files. + need_generate_file_checksum_ = false; + } else { + need_generate_file_checksum_ = true; + } + FileChecksumGenContext gen_context; + std::unique_ptr<FileChecksumGenerator> file_checksum_gen = + db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::vector<std::string> generated_checksums; + std::vector<std::string> generated_checksum_func_names; + // Step 1: generate the checksum for ingested sst file. + if (need_generate_file_checksum_) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + std::string generated_checksum; + std::string generated_checksum_func_name; + std::string requested_checksum_func_name; + // TODO: rate limit file reads for checksum calculation during file + // ingestion. + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), files_to_ingest_[i].internal_file_path, + db_options_.file_checksum_gen_factory.get(), + requested_checksum_func_name, &generated_checksum, + &generated_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get(), + Env::IO_TOTAL /* rate_limiter_priority */); + if (!io_s.ok()) { + status = io_s; + ROCKS_LOG_WARN(db_options_.info_log, + "Sst file checksum generation of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + if (ingestion_options_.write_global_seqno == false) { + files_to_ingest_[i].file_checksum = generated_checksum; + files_to_ingest_[i].file_checksum_func_name = + generated_checksum_func_name; + } + generated_checksums.push_back(generated_checksum); + generated_checksum_func_names.push_back(generated_checksum_func_name); + } + } + + // Step 2: based on the verify_file_checksum and ingested checksum + // information, do the verification. + if (status.ok()) { + if (files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Verify the checksum and checksum function name. 
+ if (ingestion_options_.verify_file_checksum) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != + generated_checksum_func_names[i]) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + if (files_checksums[i] != generated_checksums[i]) { + status = Status::Corruption( + "Ingested checksum does not match with the generated " + "checksum"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + } + } else { + // If verify_file_checksum is not enabled, we only verify the + // checksum function name. If it does not match, fail the ingestion. + // If matches, we trust the ingested checksum information and store + // in the Manifest. + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != file_checksum_gen->Name()) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + files_to_ingest_[i].file_checksum = files_checksums[i]; + files_to_ingest_[i].file_checksum_func_name = + files_checksum_func_names[i]; + } + } + } else if (files_checksums.size() != files_checksum_func_names.size() || + (files_checksums.size() == files_checksum_func_names.size() && + files_checksums.size() != 0)) { + // The checksum or checksum function name vector are not both empty + // and they are incomplete. + status = Status::InvalidArgument( + "The checksum information of ingested sst files are nonempty and " + "the size of checksums or the size of the checksum function " + "names " + "does not match with the number of ingested sst files"); + ROCKS_LOG_WARN( + db_options_.info_log, + "The ingested sst files checksum information is incomplete: %s", + status.ToString().c_str()); + } + } + } + + // TODO: The following is duplicated with Cleanup(). + if (!status.ok()) { + IOOptions io_opts; + // We failed, remove all files that we copied into the db + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } + + return status; +} + +Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, + SuperVersion* super_version) { + autovector<Range> ranges; + autovector<std::string> keys; + size_t ts_sz = cfd_->user_comparator()->timestamp_size(); + if (ts_sz) { + // Check all ranges [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to + // include all `end` keys. 
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + std::string begin_str; + std::string end_str; + AppendUserKeyWithMaxTimestamp( + &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz); + AppendKeyWithMinTimestamp( + &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz); + keys.emplace_back(std::move(begin_str)); + keys.emplace_back(std::move(end_str)); + } + for (size_t i = 0; i < files_to_ingest_.size(); ++i) { + ranges.emplace_back(keys[2 * i], keys[2 * i + 1]); + } + } else { + for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { + ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), + file_to_ingest.largest_internal_key.user_key()); + } + } + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); + if (status.ok() && *flush_needed && + !ingestion_options_.allow_blocking_flush) { + status = Status::InvalidArgument("External file requires flush"); + } + return status; +} + +// REQUIRES: we have become the only writer by entering both write_thread_ and +// nonmem_write_thread_ +Status ExternalSstFileIngestionJob::Run() { + Status status; + SuperVersion* super_version = cfd_->GetSuperVersion(); +#ifndef NDEBUG + // We should never run the job with a memtable that is overlapping + // with the files we are ingesting + bool need_flush = false; + status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } + assert(status.ok() && need_flush == false); +#endif + + bool force_global_seqno = false; + + if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) { + // We need to assign a global sequence number to all the files even + // if the don't overlap with any ranges since we have snapshots + force_global_seqno = true; + } + // It is safe to use this instead of LastAllocatedSequence since we are + // the only active writer, and hence they are equal + SequenceNumber last_seqno = versions_->LastSequence(); + edit_.SetColumnFamily(cfd_->GetID()); + // The levels that the files will be ingested into + + for (IngestedFileInfo& f : files_to_ingest_) { + SequenceNumber assigned_seqno = 0; + if (ingestion_options_.ingest_behind) { + status = CheckLevelForIngestedBehindFile(&f); + } else { + status = AssignLevelAndSeqnoForIngestedFile( + super_version, force_global_seqno, cfd_->ioptions()->compaction_style, + last_seqno, &f, &assigned_seqno); + } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. 
+ ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } + if (!status.ok()) { + return status; + } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", + &assigned_seqno); + if (assigned_seqno > last_seqno) { + assert(assigned_seqno == last_seqno + 1); + last_seqno = assigned_seqno; + ++consumed_seqno_count_; + } + if (!status.ok()) { + return status; + } + + status = GenerateChecksumForIngestedFile(&f); + if (!status.ok()) { + return status; + } + + // We use the import time as the ancester time. This is the time the data + // is written to the database. + int64_t temp_current_time = 0; + uint64_t current_time = kUnknownFileCreationTime; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + if (clock_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast<uint64_t>(temp_current_time); + } + FileMetaData f_metadata( + f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), + f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, + f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, f.file_checksum, + f.file_checksum_func_name, f.unique_id); + f_metadata.temperature = f.file_temperature; + edit_.AddFile(f.picked_level, f_metadata); + } + return status; +} + +void ExternalSstFileIngestionJob::UpdateStats() { + // Update internal stats for new ingested files + uint64_t total_keys = 0; + uint64_t total_l0_files = 0; + uint64_t total_time = clock_->NowMicros() - job_start_time_; + + EventLoggerStream stream = event_logger_->Log(); + stream << "event" + << "ingest_finished"; + stream << "files_ingested"; + stream.StartArray(); + + for (IngestedFileInfo& f : files_to_ingest_) { + InternalStats::CompactionStats stats( + CompactionReason::kExternalSstIngestion, 1); + stats.micros = total_time; + // If actual copy occurred for this file, then we need to count the file + // size as the actual bytes written. If the file was linked, then we ignore + // the bytes written for file metadata. + // TODO (yanqin) maybe account for file metadata bytes for exact accuracy? 
+ if (f.copy_file) { + stats.bytes_written = f.fd.GetFileSize(); + } else { + stats.bytes_moved = f.fd.GetFileSize(); + } + stats.num_output_files = 1; + cfd_->internal_stats()->AddCompactionStats(f.picked_level, + Env::Priority::USER, stats); + cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE, + f.fd.GetFileSize()); + total_keys += f.num_entries; + if (f.picked_level == 0) { + total_l0_files += 1; + } + ROCKS_LOG_INFO( + db_options_.info_log, + "[AddFile] External SST file %s was ingested in L%d with path %s " + "(global_seqno=%" PRIu64 ")\n", + f.external_file_path.c_str(), f.picked_level, + f.internal_file_path.c_str(), f.assigned_seqno); + stream << "file" << f.internal_file_path << "level" << f.picked_level; + } + stream.EndArray(); + + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); + } + stream.EndArray(); + + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL, + total_keys); + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL, + files_to_ingest_.size()); + cfd_->internal_stats()->AddCFStats( + InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files); +} + +void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; + if (!status.ok()) { + // We failed to add the files to the database + // remove all the files we copied + for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + consumed_seqno_count_ = 0; + files_overlap_ = false; + } else if (status.ok() && ingestion_options_.move_files) { + // The files were moved and added successfully, remove original file links + for (IngestedFileInfo& f : files_to_ingest_) { + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ExternalSstFileIngestionJob::GetIngestedFileInfo( + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, SuperVersion* sv) { + file_to_ingest->external_file_path = external_file; + + // Get external file size + Status status = fs_->GetFileSize(external_file, IOOptions(), + &file_to_ingest->file_size, nullptr); + if (!status.ok()) { + return status; + } + + // Assign FD with number + file_to_ingest->fd = + FileDescriptor(new_file_number, 0, file_to_ingest->file_size); + + // Create TableReader for external file + std::unique_ptr<TableReader> table_reader; + std::unique_ptr<FSRandomAccessFile> sst_file; + std::unique_ptr<RandomAccessFileReader> sst_file_reader; + + status = + fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); + if (!status.ok()) { + return status; + } + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + 
/*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), + std::move(sst_file_reader), file_to_ingest->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + if (ingestion_options_.verify_checksums_before_ingest) { + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + ReadOptions ro; + ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + status = table_reader->VerifyChecksum( + ro, TableReaderCaller::kExternalSSTIngestion); + } + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + const auto& uprops = props->user_collected_properties; + + // Get table version + auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion); + if (version_iter == uprops.end()) { + return Status::Corruption("External file version not found"); + } + file_to_ingest->version = DecodeFixed32(version_iter->second.c_str()); + + auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno); + if (file_to_ingest->version == 2) { + // version 2 imply that we have global sequence number + if (seqno_iter == uprops.end()) { + return Status::Corruption( + "External file global sequence number not found"); + } + + // Set the global sequence number + file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str()); + if (props->external_sst_file_global_seqno_offset == 0) { + file_to_ingest->global_seqno_offset = 0; + return Status::Corruption("Was not able to find file global seqno field"); + } + file_to_ingest->global_seqno_offset = + static_cast<size_t>(props->external_sst_file_global_seqno_offset); + } else if (file_to_ingest->version == 1) { + // SST file V1 should not have global seqno field + assert(seqno_iter == uprops.end()); + file_to_ingest->original_seqno = 0; + if (ingestion_options_.allow_blocking_flush || + ingestion_options_.allow_global_seqno) { + return Status::InvalidArgument( + "External SST file V1 does not support global seqno"); + } + } else { + return Status::InvalidArgument("External file version is not supported"); + } + // Get number of entries in table + file_to_ingest->num_entries = props->num_entries; + file_to_ingest->num_range_deletions = props->num_range_deletions; + + ParsedInternalKey key; + ReadOptions ro; + // During reading the external file we can cache blocks that we read into + // the block cache, if we later change the global seqno of this file, we will + // have block in cache that will include keys with wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache. + ro.fill_cache = false; + std::unique_ptr<InternalIterator> iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + std::unique_ptr<InternalIterator> range_del_iter( + table_reader->NewRangeTombstoneIterator(ro)); + + // Get first (smallest) and last (largest) key from file. 
+ file_to_ingest->smallest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + file_to_ingest->largest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; + iter->SeekToFirst(); + if (iter->Valid()) { + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->smallest_internal_key.SetFrom(key); + + iter->SeekToLast(); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + if (key.sequence != 0) { + return Status::Corruption("External file has non zero sequence number"); + } + file_to_ingest->largest_internal_key.SetFrom(key); + + bounds_set = true; + } + + // We may need to adjust these key bounds, depending on whether any range + // deletion tombstones extend past them. + const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); + if (range_del_iter != nullptr) { + for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); + range_del_iter->Next()) { + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone tombstone(key, range_del_iter->value()); + + InternalKey start_key = tombstone.SerializeKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, start_key, + file_to_ingest->smallest_internal_key) < 0) { + file_to_ingest->smallest_internal_key = start_key; + } + InternalKey end_key = tombstone.SerializeEndKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, end_key, + file_to_ingest->largest_internal_key) > 0) { + file_to_ingest->largest_internal_key = end_key; + } + bounds_set = true; + } + } + + file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id); + + file_to_ingest->table_properties = *props; + + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_ingest->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_ingest->internal_file_path.c_str()); + file_to_ingest->unique_id = kNullUniqueId64x2; + } + + return status; +} + +Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( + SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno) { + Status status; + *assigned_seqno = 0; + if (force_global_seqno) { + *assigned_seqno = last_seqno + 1; + if (compaction_style == kCompactionStyleUniversal || files_overlap_) { + if (ingestion_options_.fail_if_not_bottommost_level) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of " + "Lmax does not overlap with files to ingest."); + return status; + } + file_to_ingest->picked_level = 0; + return status; + } + } + + bool overlap_with_db = false; + Arena arena; + ReadOptions ro; + ro.total_order_seek = true; + int target_level = 0; + auto* vstorage = cfd_->current()->storage_info(); + + for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + if (lvl > 0 && lvl < vstorage->base_level()) { + continue; + } + + if (vstorage->NumLevelFiles(lvl) > 0) { + bool overlap_with_level = false; + status = sv->current->OverlapWithLevelIterator( + ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl, + &overlap_with_level); + if (!status.ok()) { + return status; + } + if (overlap_with_level) { + // We must use L0 or any level higher than `lvl` to be able to overwrite + // the keys that we overlap with in this level, We also need to assign + // this file a seqno to overwrite the existing keys in level `lvl` + overlap_with_db = true; + break; + } + + if (compaction_style == kCompactionStyleUniversal && lvl != 0) { + const std::vector<FileMetaData*>& level_files = + vstorage->LevelFiles(lvl); + const SequenceNumber level_largest_seqno = + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) + ->fd.largest_seqno; + // should only assign seqno to current level's largest seqno when + // the file fits + if (level_largest_seqno != 0 && + IngestedFileFitInLevel(file_to_ingest, lvl)) { + *assigned_seqno = level_largest_seqno; + } else { + continue; + } + } + } else if (compaction_style == kCompactionStyleUniversal) { + continue; + } + + // We don't overlap with any keys in this level, but we still need to check + // if our file can fit in it + if (IngestedFileFitInLevel(file_to_ingest, lvl)) { + target_level = lvl; + } + } + // If files overlap, we have to ingest them at level 0 and assign the newest + // sequence number + if (files_overlap_) { + target_level = 0; + *assigned_seqno = last_seqno + 1; + } + + if (ingestion_options_.fail_if_not_bottommost_level && + target_level < cfd_->NumberLevels() - 1) { + status = Status::TryAgain( + "Files cannot be ingested to Lmax. 
Please make sure key range of Lmax " + "does not overlap with files to ingest."); + return status; + } + + TEST_SYNC_POINT_CALLBACK( + "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", + &overlap_with_db); + file_to_ingest->picked_level = target_level; + if (overlap_with_db && *assigned_seqno == 0) { + *assigned_seqno = last_seqno + 1; + } + return status; +} + +Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile( + IngestedFileInfo* file_to_ingest) { + auto* vstorage = cfd_->current()->storage_info(); + // first check if new files fit in the bottommost level + int bottom_lvl = cfd_->NumberLevels() - 1; + if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) { + return Status::InvalidArgument( + "Can't ingest_behind file as it doesn't fit " + "at the bottommost level!"); + } + + // second check if despite allow_ingest_behind=true we still have 0 seqnums + // at some upper level + for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) { + for (auto file : vstorage->LevelFiles(lvl)) { + if (file->fd.smallest_seqno == 0) { + return Status::InvalidArgument( + "Can't ingest_behind file as despite allow_ingest_behind=true " + "there are files with 0 seqno in database at upper levels!"); + } + } + } + + file_to_ingest->picked_level = bottom_lvl; + return Status::OK(); +} + +Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( + IngestedFileInfo* file_to_ingest, SequenceNumber seqno) { + if (file_to_ingest->original_seqno == seqno) { + // This file already have the correct global seqno + return Status::OK(); + } else if (!ingestion_options_.allow_global_seqno) { + return Status::InvalidArgument("Global seqno is required, but disabled"); + } else if (file_to_ingest->global_seqno_offset == 0) { + return Status::InvalidArgument( + "Trying to set global seqno for a file that don't have a global seqno " + "field"); + } + + if (ingestion_options_.write_global_seqno) { + // Determine if we can write global_seqno to a given offset of file. + // If the file system does not support random write, then we should not. + // Otherwise we should. 
+ std::unique_ptr<FSRandomRWFile> rwfile; + Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, + env_options_, &rwfile, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", + &status); + if (status.ok()) { + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); + std::string seqno_val; + PutFixed64(&seqno_val, seqno); + status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, + IOOptions(), nullptr); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(fsptr.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } + if (!status.ok()) { + return status; + } + } else if (!status.IsNotSupported()) { + return status; + } + } + + file_to_ingest->assigned_seqno = seqno; + return Status::OK(); +} + +IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( + IngestedFileInfo* file_to_ingest) { + if (db_options_.file_checksum_gen_factory == nullptr || + need_generate_file_checksum_ == false || + ingestion_options_.write_global_seqno == false) { + // If file_checksum_gen_factory is not set, we are not able to generate + // the checksum. if write_global_seqno is false, it means we will use + // file checksum generated during Prepare(). This step will be skipped. + return IOStatus::OK(); + } + std::string file_checksum; + std::string file_checksum_func_name; + std::string requested_checksum_func_name; + // TODO: rate limit file reads for checksum calculation during file ingestion. 
+ IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), file_to_ingest->internal_file_path, + db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, + &file_checksum, &file_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(), + Env::IO_TOTAL /* rate_limiter_priority */); + if (!io_s.ok()) { + return io_s; + } + file_to_ingest->file_checksum = file_checksum; + file_to_ingest->file_checksum_func_name = file_checksum_func_name; + return IOStatus::OK(); +} + +bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( + const IngestedFileInfo* file_to_ingest, int level) { + if (level == 0) { + // Files can always fit in L0 + return true; + } + + auto* vstorage = cfd_->current()->storage_info(); + Slice file_smallest_user_key( + file_to_ingest->smallest_internal_key.user_key()); + Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key()); + + if (vstorage->OverlapInLevel(level, &file_smallest_user_key, + &file_largest_user_key)) { + // File overlap with another files in this level, we cannot + // add it to this level + return false; + } + if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key, + file_largest_user_key, level)) { + // File overlap with a running compaction output that will be stored + // in this level, we cannot add this file to this level + return false; + } + + // File did not overlap with level files, our compaction output + return true; +} + +template <typename TWritableFile> +Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { + assert(file != nullptr); + if (db_options_.use_fsync) { + return file->Fsync(IOOptions(), nullptr); + } else { + return file->Sync(IOOptions(), nullptr); + } +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // !ROCKSDB_LITE |
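End to end, the code added above is reached through the public API roughly as follows. This is a minimal sketch, not part of this change: it assumes a local RocksDB build, the /tmp paths and key/value strings are placeholders, and error handling is reduced to asserts.

    #include <cassert>
    #include <string>
    #include <vector>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/sst_file_writer.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;

      // Write an external SST file; SstFileWriter emits keys with sequence
      // number 0, which GetIngestedFileInfo() checks for. Keys must be added
      // in the comparator's order.
      rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
      rocksdb::Status s = writer.Open("/tmp/example_ingest.sst");  // placeholder path
      assert(s.ok());
      s = writer.Put("key1", "value1");
      assert(s.ok());
      s = writer.Put("key2", "value2");
      assert(s.ok());
      s = writer.Finish();
      assert(s.ok());

      rocksdb::DB* db = nullptr;
      s = rocksdb::DB::Open(options, "/tmp/example_ingest_db", &db);  // placeholder path
      assert(s.ok());

      // Ingest the file; with move_files set, Prepare() tries LinkFile() first
      // and falls back to a copy if the filesystem does not support linking.
      rocksdb::IngestExternalFileOptions ifo;
      ifo.move_files = true;
      s = db->IngestExternalFile({"/tmp/example_ingest.sst"}, ifo);
      assert(s.ok());

      delete db;
      return 0;
    }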