diff options
Diffstat (limited to 'src/rocksdb/db/builder.cc')
-rw-r--r-- | src/rocksdb/db/builder.cc | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc new file mode 100644 index 000000000..9283ffd64 --- /dev/null +++ b/src/rocksdb/db/builder.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/builder.h" + +#include <algorithm> +#include <deque> +#include <vector> + +#include "db/blob/blob_file_builder.h" +#include "db/compaction/compaction_iterator.h" +#include "db/event_helpers.h" +#include "db/internal_stats.h" +#include "db/merge_helper.h" +#include "db/output_validator.h" +#include "db/range_del_aggregator.h" +#include "db/table_cache.h" +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" +#include "options/options_helper.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/format.h" +#include "table/internal_iterator.h" +#include "table/unique_id_impl.h" +#include "test_util/sync_point.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +class TableFactory; + +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file) { + assert((tboptions.column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == + tboptions.column_family_name.empty()); + return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); +} + +Status BuildTable( + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, + std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> + range_del_iters, + FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions, + std::vector<SequenceNumber> snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker, + bool paranoid_file_checks, InternalStats* internal_stats, + IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer, + BlobFileCreationReason blob_creation_reason, + const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger, + int job_id, const Env::IOPriority io_priority, + TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, + const std::string* full_history_ts_low, + BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + assert((tboptions.column_family_id == + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; + // Reports the IOStats for flush for every following bytes. + const size_t kReportFlushIOStatsEvery = 1048576; + OutputValidator output_validator( + tboptions.internal_comparator, + /*enable_order_check=*/ + mutable_cf_options.check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks); + Status s; + meta->fd.file_size = 0; + iter->SeekToFirst(); + std::unique_ptr<CompactionRangeDelAggregator> range_del_agg( + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots, full_history_ts_low)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; + for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); + range_del_agg->AddTombstones(std::move(range_del_iter)); + } + + std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), + meta->fd.GetPathId()); + std::vector<std::string> blob_file_paths; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; +#ifndef ROCKSDB_LITE + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); +#endif // !ROCKSDB_LITE + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + + TableProperties tp; + bool table_file_created = false; + if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr<CompactionFilter> compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + + TableBuilder* builder; + std::unique_ptr<WritableFileWriter> file_writer; + { + std::unique_ptr<FSWritableFile> file; +#ifndef NDEBUG + bool use_direct_writes = file_options.use_direct_writes; + TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); +#endif // !NDEBUG + IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + assert(s.ok()); + s = io_s; + if (io_status->ok()) { + *io_status = io_s; + } + if (!s.ok()) { + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); + return s; + } + + table_file_created = true; + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; + file->SetIOPriority(io_priority); + file->SetWriteLifeTimeHint(write_hint); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + builder = NewTableBuilder(tboptions, file_writer.get()); + } + + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); + + std::unique_ptr<BlobFileBuilder> blob_file_builder( + (mutable_cf_options.enable_blob_files && + tboptions.level_at_creation >= + mutable_cf_options.blob_file_starting_level && + blob_file_additions) + ? new BlobFileBuilder( + versions, fs, &ioptions, &mutable_cf_options, &file_options, + tboptions.db_id, tboptions.db_session_id, job_id, + tboptions.column_family_id, tboptions.column_family_name, + io_priority, write_hint, io_tracer, blob_callback, + blob_creation_reason, &blob_file_paths, blob_file_additions) + : nullptr); + + const std::atomic<bool> kManualCompactionCanceledFalse{false}; + CompactionIterator c_iter( + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + job_snapshot, snapshot_checker, env, + ShouldReportDetailedTime(env, ioptions.stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + blob_file_builder.get(), ioptions.allow_data_in_errors, + ioptions.enforce_single_del_contracts, + /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); + + c_iter.SeekToFirst(); + for (; c_iter.Valid(); c_iter.Next()) { + const Slice& key = c_iter.key(); + const Slice& value = c_iter.value(); + const ParsedInternalKey& ikey = c_iter.ikey(); + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. + s = output_validator.Add(key, value); + if (!s.ok()) { + break; + } + builder->Add(key, value); + + s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); + if (!s.ok()) { + break; + } + + // TODO(noetzli): Update stats after flush, too. + if (io_priority == Env::IO_HIGH && + IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) { + ThreadStatusUtil::SetThreadOperationProperty( + ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); + } + } + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), + tombstone.seq_, + tboptions.internal_comparator); + } + } + + TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } + if (!s.ok() || empty) { + builder->Abandon(); + } else { + std::string seqno_time_mapping_str; + seqno_to_time_mapping.Encode( + seqno_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder->SetSeqnoTimeTableProperties( + seqno_time_mapping_str, + ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO + ? meta->file_creation_time + : meta->oldest_ancester_time); + s = builder->Finish(); + } + if (io_status->ok()) { + *io_status = builder->io_status(); + } + + if (s.ok() && !empty) { + uint64_t file_size = builder->FileSize(); + meta->fd.file_size = file_size; + meta->marked_for_compaction = builder->NeedCompact(); + assert(meta->fd.GetFileSize() > 0); + tp = builder + ->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && + memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. + if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } + if (table_properties) { + *table_properties = tp; + } + } + delete builder; + + // Finish and check for file errors + TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); + if (s.ok() && !empty) { + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); + *io_status = file_writer->Sync(ioptions.use_fsync); + } + TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); + if (s.ok() && io_status->ok() && !empty) { + *io_status = file_writer->Close(); + } + if (s.ok() && io_status->ok() && !empty) { + // Add the checksum information to file metadata. + meta->file_checksum = file_writer->GetFileChecksum(); + meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + // Set unique_id only if db_id and db_session_id exist + if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) { + if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id, + meta->fd.GetNumber(), &(meta->unique_id)) + .ok()) { + // if failed to get unique id, just set it Null + meta->unique_id = kNullUniqueId64x2; + } + } + } + + if (s.ok()) { + s = *io_status; + } + + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(s); + } + blob_file_builder.reset(); + } + + // TODO Also check the IO status when create the Iterator. + + TEST_SYNC_POINT("BuildTable:BeforeOutputValidation"); + if (s.ok() && !empty) { + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building. + // No matter whether use_direct_io_for_flush_and_compaction is true, + // the goal is to cache it here for further user reads. + ReadOptions read_options; + std::unique_ptr<InternalIterator> it(table_cache->NewIterator( + read_options, file_options, tboptions.internal_comparator, *meta, + nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, + nullptr, + (internal_stats == nullptr) ? nullptr + : internal_stats->GetFileReadHist(0), + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, tboptions.level_at_creation, + MaxFileSizeForL0MetaPin(mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr, + /*allow_unprepared_value*/ false)); + s = it->status(); + if (s.ok() && paranoid_file_checks) { + OutputValidator file_validator(tboptions.internal_comparator, + /*enable_order_check=*/true, + /*enable_hash=*/true); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + file_validator.Add(it->key(), it->value()).PermitUncheckedError(); + } + s = it->status(); + if (s.ok() && !output_validator.CompareValidator(file_validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } + } + } + } + + // Check for input iterator errors + if (!iter->status().ok()) { + s = iter->status(); + } + + if (!s.ok() || meta->fd.GetFileSize() == 0) { + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + + constexpr IODebugContext* dbg = nullptr; + + if (table_file_created) { + Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); + ignored.PermitUncheckedError(); + } + + assert(blob_file_additions || blob_file_paths.empty()); + + if (blob_file_additions) { + for (const std::string& blob_file_path : blob_file_paths) { + Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); + } + } + } + + Status status_for_listener = s; + if (meta->fd.GetFileSize() == 0) { + fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } + } + // Output to event logger and fire events. + EventHelpers::LogAndNotifyTableFileCreationFinished( + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, status_for_listener, file_checksum, + file_checksum_func_name); + + return s; +} + +} // namespace ROCKSDB_NAMESPACE |