diff options
Diffstat (limited to 'src/rocksdb/db/blob')
45 files changed, 12380 insertions, 0 deletions
// diff --git a/src/rocksdb/db/blob/blob_constants.h b/src/rocksdb/db/blob/blob_constants.h new file mode 100644 index 000000000..a5d09ac76 --- /dev/null +++ b/src/rocksdb/db/blob/blob_constants.h @@ -0,0 +1,16 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstdint>

#include "rocksdb/rocksdb_namespace.h"

namespace ROCKSDB_NAMESPACE {

// Blob file number that denotes "no blob file". BlobFileAddition uses it as
// the default value of its file number before one has been assigned or
// decoded.
constexpr uint64_t kInvalidBlobFileNumber = 0;

}  // namespace ROCKSDB_NAMESPACE
// diff --git a/src/rocksdb/db/blob/blob_contents.cc b/src/rocksdb/db/blob/blob_contents.cc new file mode 100644 index 000000000..9015609e7 --- /dev/null +++ b/src/rocksdb/db/blob/blob_contents.cc @@ -0,0 +1,90 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
+ +#include "db/blob/blob_contents.h" + +#include <cassert> + +#include "cache/cache_entry_roles.h" +#include "cache/cache_helpers.h" +#include "port/malloc.h" + +namespace ROCKSDB_NAMESPACE { + +std::unique_ptr<BlobContents> BlobContents::Create( + CacheAllocationPtr&& allocation, size_t size) { + return std::unique_ptr<BlobContents>( + new BlobContents(std::move(allocation), size)); +} + +size_t BlobContents::ApproximateMemoryUsage() const { + size_t usage = 0; + + if (allocation_) { + MemoryAllocator* const allocator = allocation_.get_deleter().allocator; + + if (allocator) { + usage += allocator->UsableSize(allocation_.get(), data_.size()); + } else { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(allocation_.get()); +#else + usage += data_.size(); +#endif + } + } + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast<BlobContents*>(this)); +#else + usage += sizeof(*this); +#endif + + return usage; +} + +size_t BlobContents::SizeCallback(void* obj) { + assert(obj); + + return static_cast<const BlobContents*>(obj)->size(); +} + +Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj); + + const BlobContents* buf = static_cast<const BlobContents*>(from_obj); + assert(buf->size() >= from_offset + length); + + memcpy(out, buf->data().data() + from_offset, length); + + return Status::OK(); +} + +Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() { + static Cache::CacheItemHelper cache_helper( + &SizeCallback, &SaveToCallback, + GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>()); + + return &cache_helper; +} + +Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation, + const void* buf, size_t size, + void** out_obj, size_t* charge) { + assert(allocation); + + memcpy(allocation.get(), buf, size); + + std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size); + BlobContents* const contents = obj.release(); + + 
*out_obj = contents; + *charge = contents->ApproximateMemoryUsage(); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h new file mode 100644 index 000000000..9b7c5b969 --- /dev/null +++ b/src/rocksdb/db/blob/blob_contents.h @@ -0,0 +1,56 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <memory> + +#include "memory/memory_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A class representing a single uncompressed value read from a blob file. +class BlobContents { + public: + static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation, + size_t size); + + BlobContents(const BlobContents&) = delete; + BlobContents& operator=(const BlobContents&) = delete; + + BlobContents(BlobContents&&) = default; + BlobContents& operator=(BlobContents&&) = default; + + ~BlobContents() = default; + + const Slice& data() const { return data_; } + size_t size() const { return data_.size(); } + + size_t ApproximateMemoryUsage() const; + + // Callbacks for secondary cache + static size_t SizeCallback(void* obj); + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out); + + static Cache::CacheItemHelper* GetCacheItemHelper(); + + static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf, + size_t size, void** out_obj, size_t* charge); + + private: + BlobContents(CacheAllocationPtr&& allocation, size_t size) + : allocation_(std::move(allocation)), data_(allocation_.get(), size) {} + + CacheAllocationPtr allocation_; + Slice data_; +}; + +} // namespace 
ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h new file mode 100644 index 000000000..de549afa2 --- /dev/null +++ b/src/rocksdb/db/blob/blob_counting_iterator.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> + +#include "db/blob/blob_garbage_meter.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that passes each key-value encountered to +// BlobGarbageMeter as inflow in order to measure the total number and size of +// blobs in the compaction input on a per-blob file basis. +class BlobCountingIterator : public InternalIterator { + public: + BlobCountingIterator(InternalIterator* iter, + BlobGarbageMeter* blob_garbage_meter) + : iter_(iter), blob_garbage_meter_(blob_garbage_meter) { + assert(iter_); + assert(blob_garbage_meter_); + + UpdateAndCountBlobIfNeeded(); + } + + bool Valid() const override { return iter_->Valid() && status_.ok(); } + + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateAndCountBlobIfNeeded(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + UpdateAndCountBlobIfNeeded(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + UpdateAndCountBlobIfNeeded(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + UpdateAndCountBlobIfNeeded(); + } + + void Next() override { + assert(Valid()); + + iter_->Next(); + UpdateAndCountBlobIfNeeded(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + + const bool res = iter_->NextAndGetResult(result); + 
UpdateAndCountBlobIfNeeded(); + return res; + } + + void Prev() override { + assert(Valid()); + + iter_->Prev(); + UpdateAndCountBlobIfNeeded(); + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice user_key() const override { + assert(Valid()); + return iter_->user_key(); + } + + Slice value() const override { + assert(Valid()); + return iter_->value(); + } + + Status status() const override { return status_; } + + bool PrepareValue() override { + assert(Valid()); + return iter_->PrepareValue(); + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return iter_->UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(Valid()); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateAndCountBlobIfNeeded() { + assert(!iter_->Valid() || iter_->status().ok()); + + if (!iter_->Valid()) { + status_ = iter_->status(); + return; + } + + TEST_SYNC_POINT( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow"); + + status_ = blob_garbage_meter_->ProcessInFlow(key(), value()); + } + + InternalIterator* iter_; + BlobGarbageMeter* blob_garbage_meter_; + Status status_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc new file mode 100644 index 000000000..c7bbc8f58 --- /dev/null +++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc @@ -0,0 +1,327 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_counting_iterator.h" + +#include <string> +#include <vector> + +#include "db/blob/blob_garbage_meter.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter, + uint64_t blob_file_number, uint64_t count, uint64_t bytes) { + const auto& flows = blob_garbage_meter.flows(); + + const auto it = flows.find(blob_file_number); + if (it == flows.end()) { + ASSERT_EQ(count, 0); + ASSERT_EQ(bytes, 0); + return; + } + + const auto& in = it->second.GetInFlow(); + + ASSERT_EQ(in.GetCount(), count); + ASSERT_EQ(in.GetBytes(), bytes); +} + +TEST(BlobCountingIteratorTest, CountBlobs) { + // Note: the input consists of three key-values: two are blob references to + // different blob files, while the third one is a plain value. 
+ constexpr char user_key0[] = "key0"; + constexpr char user_key1[] = "key1"; + constexpr char user_key2[] = "key2"; + + const std::vector<std::string> keys{ + test::KeyStr(user_key0, 1, kTypeBlobIndex), + test::KeyStr(user_key1, 2, kTypeBlobIndex), + test::KeyStr(user_key2, 3, kTypeValue)}; + + constexpr uint64_t first_blob_file_number = 4; + constexpr uint64_t first_offset = 1000; + constexpr uint64_t first_size = 2000; + + std::string first_blob_index; + BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset, + first_size, kNoCompression); + + constexpr uint64_t second_blob_file_number = 6; + constexpr uint64_t second_offset = 2000; + constexpr uint64_t second_size = 4000; + + std::string second_blob_index; + BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number, + second_offset, second_size, kNoCompression); + + const std::vector<std::string> values{first_blob_index, second_blob_index, + "raw_value"}; + + assert(keys.size() == values.size()); + + VectorIterator input(keys, values); + BlobGarbageMeter blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + constexpr uint64_t first_expected_bytes = + first_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1); + constexpr uint64_t second_expected_bytes = + second_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1); + + // Call SeekToFirst and iterate forward + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + 
ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + // Do it again using NextAndGetResult + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + { + IterateResult result; + ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + 
ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + ASSERT_FALSE(blob_counter.NextAndGetResult(&result)); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + // Call SeekToLast and iterate backward + blob_counter.SeekToLast(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, 
first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + // Call Seek for all keys (plus one that's greater than all of them) + blob_counter.Seek(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Seek(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek("zzz"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, 
second_blob_file_number, 4, + 4 * second_expected_bytes); + + // Call SeekForPrev for all keys (plus one that's less than all of them) + blob_counter.SeekForPrev("aaa"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); +} + +TEST(BlobCountingIteratorTest, CorruptBlobIndex) { + const std::vector<std::string> keys{ + test::KeyStr("user_key", 1, kTypeBlobIndex)}; + const std::vector<std::string> values{"i_am_not_a_blob_index"}; + + assert(keys.size() == values.size()); + + VectorIterator 
input(keys, values); + BlobGarbageMeter blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + blob_counter.SeekToFirst(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_NOK(blob_counter.status()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc new file mode 100644 index 000000000..124429f93 --- /dev/null +++ b/src/rocksdb/db/blob/blob_fetcher.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index_slice, + prefetch_buffer, blob_value, bytes_read); +} + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer, + blob_value, bytes_read); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h new file mode 100644 index 000000000..8aeaf965d --- /dev/null +++ b/src/rocksdb/db/blob/blob_fetcher.h @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Version; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class BlobIndex; + +// A thin wrapper around the blob retrieval functionality of Version. +class BlobFetcher { + public: + BlobFetcher(const Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + private: + const Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc new file mode 100644 index 000000000..71b1bb7fc --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_addition.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_addition.h" + +#include <ostream> +#include <sstream> + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileAddition::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileAddition::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, total_blob_count_); + PutVarint64(output, total_blob_bytes_); + PutLengthPrefixedSlice(output, checksum_method_); + PutLengthPrefixedSlice(output, checksum_value_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileAddition::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileAddition"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &total_blob_count_)) { + return Status::Corruption(class_name, "Error decoding total blob count"); + } + + if (!GetVarint64(input, &total_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding total blob bytes"); + } + + Slice checksum_method; + if (!GetLengthPrefixedSlice(input, &checksum_method)) { + return Status::Corruption(class_name, "Error decoding checksum method"); + } + checksum_method_ = checksum_method.ToString(); + + Slice checksum_value; + if (!GetLengthPrefixedSlice(input, &checksum_value)) { + return Status::Corruption(class_name, "Error decoding checksum value"); + } + checksum_value_ = checksum_value.ToString(); + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return 
Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileAddition::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileAddition::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() && + lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() && + lhs.GetChecksumMethod() == rhs.GetChecksumMethod() && + lhs.GetChecksumValue() == rhs.GetChecksumValue(); +} + +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition) { + os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber() + << " total_blob_count: " << blob_file_addition.GetTotalBlobCount() + << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes() + << " checksum_method: " << blob_file_addition.GetChecksumMethod() + << " checksum_value: " + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition) { + jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber() + << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount() + << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes() + << 
"ChecksumMethod" << blob_file_addition.GetChecksumMethod() + << "ChecksumValue" + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h new file mode 100644 index 000000000..43b1a0bcb --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_addition.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> +#include <cstdint> +#include <iosfwd> +#include <string> + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileAddition { + public: + BlobFileAddition() = default; + + BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum 
CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t total_blob_count_ = 0; + uint64_t total_blob_bytes_ = 0; + std::string checksum_method_; + std::string checksum_value_; +}; + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs); +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc new file mode 100644 index 000000000..64cb0a9d6 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_addition_test.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#include "db/blob/blob_file_addition.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test fixture for BlobFileAddition. Provides a shared helper that encodes an
+// object, decodes the result into a fresh object and checks both are equal.
+class BlobFileAdditionTest : public testing::Test {
+ public:
+  static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) {
+    std::string encoded;
+    blob_file_addition.EncodeTo(&encoded);
+
+    BlobFileAddition decoded;
+    Slice input(encoded);
+    ASSERT_OK(decoded.DecodeFrom(&input));
+
+    ASSERT_EQ(blob_file_addition, decoded);
+  }
+};
+
+// A default-constructed BlobFileAddition has an invalid file number, zero
+// counts and empty checksum fields, and survives an encode/decode round trip.
+TEST_F(BlobFileAdditionTest, Empty) {
+  BlobFileAddition blob_file_addition;
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0);
+  ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty());
+  ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty());
+
+  TestEncodeDecode(blob_file_addition);
+}
+
+// The getters return exactly what was passed to the constructor, and the
+// populated object survives an encode/decode round trip.
+TEST_F(BlobFileAdditionTest, NonEmpty) {
+  constexpr uint64_t blob_file_number = 123;
+  constexpr uint64_t total_blob_count = 2;
+  constexpr uint64_t total_blob_bytes = 123456;
+  const std::string checksum_method("SHA1");
+  const std::string checksum_value(
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value);
+
+  TestEncodeDecode(blob_file_addition);
+}
+
+// Builds the encoded form one field at a time. After each step decoding must
+// fail with Corruption and the status message must name the first field that
+// is still missing.
+TEST_F(BlobFileAdditionTest, DecodeErrors) {
+  std::string str;
+  Slice slice(str);
+
+  BlobFileAddition blob_file_addition;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+  }
+
+  constexpr uint64_t blob_file_number = 123;
+  PutVarint64(&str, blob_file_number);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "total blob count"));
+  }
+
+  constexpr uint64_t total_blob_count = 4567;
+  PutVarint64(&str, total_blob_count);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes"));
+  }
+
+  constexpr uint64_t total_blob_bytes = 12345678;
+  PutVarint64(&str, total_blob_bytes);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "checksum method"));
+  }
+
+  constexpr char checksum_method[] = "SHA1";
+  PutLengthPrefixedSlice(&str, checksum_method);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "checksum value"));
+  }
+
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+  PutLengthPrefixedSlice(&str, checksum_value);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+  }
+
+  constexpr uint32_t custom_tag = 2;
+  PutVarint32(&str, custom_tag);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+  }
+}
+
+// Uses the "BlobFileAddition::EncodeTo::CustomFields" sync point to append an
+// extra custom field (tag 2) to the encoded output; the round trip must still
+// succeed.
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_compatible_tag = 2;
+        PutVarint32(output, forward_compatible_tag);
+
+        PutLengthPrefixedSlice(output, "deadbeef");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 678;
+  constexpr uint64_t total_blob_count = 9999;
+  constexpr uint64_t total_blob_bytes = 100000000;
+  const std::string checksum_method("CRC32");
+  const std::string checksum_value("\x3d\x87\xff\x57");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  TestEncodeDecode(blob_file_addition);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Same injection as above but with tag (1 << 6) + 1; decoding must fail with
+// Corruption and a "Forward incompatible" message.
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+        PutVarint32(output, forward_incompatible_tag);
+
+        PutLengthPrefixedSlice(output, "foobar");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 456;
+  constexpr uint64_t total_blob_count = 100;
+  constexpr uint64_t total_blob_bytes = 2000000;
+  const std::string checksum_method("CRC32B");
+  const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  std::string encoded;
+  blob_file_addition.EncodeTo(&encoded);
+
+  BlobFileAddition decoded_blob_file_addition;
+  Slice input(encoded);
+  const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/event_helpers.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Convenience constructor: wraps `versions->NewFileNumber()` in a file number
+// generator and delegates to the generator-based constructor below.
+// NOTE(review): `db_id` and `db_session_id` are by-value parameters that are
+// forwarded as lvalues, so the delegated-to constructor receives copies;
+// std::move would avoid that. `versions` is captured without a null check.
+BlobFileBuilder::BlobFileBuilder(
+    VersionSet* versions, FileSystem* fs,
+    const ImmutableOptions* immutable_options,
+    const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+    std::string db_id, std::string db_session_id, int job_id,
+    uint32_t column_family_id, const std::string& column_family_name,
+    Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCompletionCallback* blob_callback,
+    BlobFileCreationReason creation_reason,
+    std::vector<std::string>* blob_file_paths,
+    std::vector<BlobFileAddition>* blob_file_additions)
+    : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
+                      immutable_options, mutable_cf_options, file_options,
+                      db_id, db_session_id, job_id, column_family_id,
+                      column_family_name, io_priority, write_hint, io_tracer,
+                      blob_callback, creation_reason, blob_file_paths,
+                      blob_file_additions) {}
+
+// Main constructor. Copies the settings it needs out of `mutable_cf_options`
+// (min_blob_size, blob_file_size, blob_compression_type,
+// prepopulate_blob_cache) and stores the remaining pointers as-is. The two
+// output vectors must be non-null and empty (asserted below).
+// NOTE(review): `mutable_cf_options` is dereferenced in the initializer list
+// without a preceding assert.
+BlobFileBuilder::BlobFileBuilder(
+    std::function<uint64_t()> file_number_generator, FileSystem* fs,
+    const ImmutableOptions* immutable_options,
+    const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+    std::string db_id, std::string db_session_id, int job_id,
+    uint32_t column_family_id, const std::string& column_family_name,
+    Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCompletionCallback* blob_callback,
+    BlobFileCreationReason creation_reason,
+    std::vector<std::string>* blob_file_paths,
+    std::vector<BlobFileAddition>* blob_file_additions)
+    : file_number_generator_(std::move(file_number_generator)),
+      fs_(fs),
+      immutable_options_(immutable_options),
+      min_blob_size_(mutable_cf_options->min_blob_size),
+      blob_file_size_(mutable_cf_options->blob_file_size),
+      blob_compression_type_(mutable_cf_options->blob_compression_type),
+      prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
+      file_options_(file_options),
+      db_id_(std::move(db_id)),
+      db_session_id_(std::move(db_session_id)),
+      job_id_(job_id),
+      column_family_id_(column_family_id),
+      column_family_name_(column_family_name),
+      io_priority_(io_priority),
+      write_hint_(write_hint),
+      io_tracer_(io_tracer),
+      blob_callback_(blob_callback),
+      creation_reason_(creation_reason),
+      blob_file_paths_(blob_file_paths),
+      blob_file_additions_(blob_file_additions),
+      blob_count_(0),
+      blob_bytes_(0) {
+  assert(file_number_generator_);
+  assert(fs_);
+  assert(immutable_options_);
+  assert(file_options_);
+  assert(blob_file_paths_);
+  assert(blob_file_paths_->empty());
+  assert(blob_file_additions_);
+  assert(blob_file_additions_->empty());
+}
+
+BlobFileBuilder::~BlobFileBuilder() = default;
+
+// Adds one key/value pair.
+//
+// If `value` is smaller than min_blob_size_, nothing is written and
+// `*blob_index` stays empty. Otherwise the value is (optionally compressed
+// and) appended to the current blob file, which is opened on demand and
+// closed once it reaches blob_file_size_, and `*blob_index` receives the
+// encoded reference to the blob.
+//
+// `blob_index` must be non-null and empty on entry. Returns the first
+// non-OK status from open/compress/write/close; a failure to pre-populate
+// the blob cache is only logged.
+Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
+                            std::string* blob_index) {
+  assert(blob_index);
+  assert(blob_index->empty());
+
+  if (value.size() < min_blob_size_) {
+    return Status::OK();
+  }
+
+  {
+    const Status s = OpenBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // `blob` starts out aliasing `value`; CompressBlobIfNeeded may repoint it
+  // at `compressed_blob`.
+  Slice blob = value;
+  std::string compressed_blob;
+
+  {
+    const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t blob_file_number = 0;
+  uint64_t blob_offset = 0;
+
+  {
+    const Status s =
+        WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s = CloseBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    // The original `value` (not the possibly compressed `blob`) is what gets
+    // offered to the cache.
+    const Status s =
+        PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_options_->info_log,
+                     "Failed to pre-populate the blob into blob cache: %s",
+                     s.ToString().c_str());
+    }
+  }
+
+  // The index records the size of the blob as written to the file.
+  BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+                        blob_compression_type_);
+
+  return Status::OK();
+}
+
+// Closes the currently open blob file, if any. Returns OK when no file is
+// open.
+Status BlobFileBuilder::Finish() {
+  if (!IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+// A blob file is considered open exactly while writer_ is set.
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+// Opens a new blob file unless one is already open: obtains a file number,
+// creates the file under the first cf_path, records its path in
+// blob_file_paths_, wraps it in a WritableFileWriter/BlobLogWriter and writes
+// the blob log header. writer_ is only set once the header has been written.
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+  if (IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  assert(!blob_count_);
+  assert(!blob_bytes_);
+
+  assert(file_number_generator_);
+  const uint64_t blob_file_number = file_number_generator_();
+
+  assert(immutable_options_);
+  assert(!immutable_options_->cf_paths.empty());
+  std::string blob_file_path =
+      BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+  if (blob_callback_) {
+    blob_callback_->OnBlobFileCreationStarted(
+        blob_file_path, column_family_name_, job_id_, creation_reason_);
+  }
+
+  std::unique_ptr<FSWritableFile> file;
+
+  {
+    assert(file_options_);
+    Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Note: files get added to blob_file_paths_ right after the open, so they
+  // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+  // which only contains successfully written files.
+  assert(blob_file_paths_);
+  blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+  assert(file);
+  file->SetIOPriority(io_priority_);
+  file->SetWriteLifeTimeHint(write_hint_);
+  FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+  Statistics* const statistics = immutable_options_->stats;
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_paths_->back(), *file_options_,
+      immutable_options_->clock, io_tracer_, statistics,
+      immutable_options_->listeners,
+      immutable_options_->file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kBlobFile), false));
+
+  constexpr bool do_flush = false;
+
+  std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+      std::move(file_writer), immutable_options_->clock, statistics,
+      blob_file_number, immutable_options_->use_fsync, do_flush));
+
+  // Files produced by this builder carry no TTL information.
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+
+  BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+                       expiration_range);
+
+  {
+    Status s = blob_log_writer->WriteHeader(header);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  writer_ = std::move(blob_log_writer);
+
+  assert(IsBlobFileOpen());
+
+  return Status::OK();
+}
+
+// Compresses `*blob` with blob_compression_type_ unless that is
+// kNoCompression. On success the compressed bytes are stored in
+// `*compressed_blob` and `*blob` is repointed at them; `compressed_blob`
+// must be empty on entry. Returns Corruption if CompressData fails.
+Status BlobFileBuilder::CompressBlobIfNeeded(
+    Slice* blob, std::string* compressed_blob) const {
+  assert(blob);
+  assert(compressed_blob);
+  assert(compressed_blob->empty());
+  assert(immutable_options_);
+
+  if (blob_compression_type_ == kNoCompression) {
+    return Status::OK();
+  }
+
+  CompressionOptions opts;
+  CompressionContext context(blob_compression_type_);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       blob_compression_type_, sample_for_compression);
+
+  constexpr uint32_t compression_format_version = 2;
+
+  bool success = false;
+
+  {
+    // The stop watch is scoped so that it covers only the CompressData call.
+    StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+                         BLOB_DB_COMPRESSION_MICROS);
+    success =
+        CompressData(*blob, info, compression_format_version, compressed_blob);
+  }
+
+  if (!success) {
+    return Status::Corruption("Error compressing blob");
+  }
+
+  *blob = Slice(*compressed_blob);
+
+  return Status::OK();
+}
+
+// Appends one record to the open blob file. On success, `*blob_file_number`
+// and `*blob_offset` identify where the blob was written, and the running
+// blob_count_/blob_bytes_ totals are updated (bytes include the record
+// header and the key).
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+                                        uint64_t* blob_file_number,
+                                        uint64_t* blob_offset) {
+  assert(IsBlobFileOpen());
+  assert(blob_file_number);
+  assert(blob_offset);
+
+  uint64_t key_offset = 0;
+
+  Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  *blob_file_number = writer_->get_log_number();
+
+  ++blob_count_;
+  blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+  return Status::OK();
+}
+
+// Writes the footer, notifies the completion callback (if any), records a
+// BlobFileAddition for the file, logs a summary and resets the per-file
+// state. If the footer cannot be written, the function returns early and
+// leaves writer_ and the counters untouched.
+Status BlobFileBuilder::CloseBlobFile() {
+  assert(IsBlobFileOpen());
+
+  BlobLogFooter footer;
+  footer.blob_count = blob_count_;
+
+  std::string checksum_method;
+  std::string checksum_value;
+
+  Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+  // NOTE(review): the sync point is named after WriteBlobToFile although it
+  // fires here; presumably tests reference it by this exact name.
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const uint64_t blob_file_number = writer_->get_log_number();
+
+  if (blob_callback_) {
+    s = blob_callback_->OnBlobFileCompleted(
+        blob_file_paths_->back(), column_family_name_, job_id_,
+        blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+        blob_count_, blob_bytes_);
+  }
+
+  // NOTE(review): the addition is recorded even when the callback above
+  // returned a non-OK status; that status is still returned to the caller.
+  assert(blob_file_additions_);
+  blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+                                     std::move(checksum_method),
+                                     std::move(checksum_value));
+
+  assert(immutable_options_);
+  ROCKS_LOG_INFO(immutable_options_->logger,
+                 "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+                 " total blobs, %" PRIu64 " total bytes",
+                 column_family_name_.c_str(), job_id_, blob_file_number,
+                 blob_count_, blob_bytes_);
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+
+  return s;
+}
+
+// Closes the open blob file once the underlying file writer reports a size
+// of at least blob_file_size_; otherwise does nothing.
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+  assert(IsBlobFileOpen());
+
+  const WritableFileWriter* const file_writer = writer_->file();
+  assert(file_writer);
+
+  if (file_writer->GetFileSize() < blob_file_size_) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+// Gives up on the currently open blob file (no-op when none is open): reports
+// `s` to the completion callback with empty checksum strings, then drops the
+// writer and resets the counters. No footer is written and nothing is added
+// to blob_file_additions_.
+void BlobFileBuilder::Abandon(const Status& s) {
+  if (!IsBlobFileOpen()) {
+    return;
+  }
+  if (blob_callback_) {
+    // BlobFileBuilder::Abandon() is called because of error while writing to
+    // Blob files. So we can ignore the below error.
+    blob_callback_
+        ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+                              job_id_, writer_->get_log_number(),
+                              creation_reason_, s, "", "", blob_count_,
+                              blob_bytes_)
+        .PermitUncheckedError();
+  }
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+}
+
+// Inserts a copy of `blob` into the blob cache, keyed by
+// (db_id_, db_session_id_, blob_file_number, blob_offset), but only when a
+// blob cache is configured, prepopulate_blob_cache_ is kFlushOnly and this
+// builder was created for a flush. Returns OK when no insertion is attempted,
+// otherwise the status of the cache insertion.
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+                                                 uint64_t blob_file_number,
+                                                 uint64_t blob_offset) const {
+  Status s = Status::OK();
+
+  auto blob_cache = immutable_options_->blob_cache;
+  auto statistics = immutable_options_->statistics.get();
+  bool warm_cache =
+      prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+      creation_reason_ == BlobFileCreationReason::kFlush;
+
+  if (blob_cache && warm_cache) {
+    const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+                                            blob_file_number);
+    const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+    const Slice key = cache_key.AsSlice();
+
+    const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+    // Objects to be put into the cache have to be heap-allocated and
+    // self-contained, i.e. own their contents. The Cache has to be able to
+    // take unique ownership of them.
+    CacheAllocationPtr allocation =
+        AllocateBlock(blob.size(), blob_cache->memory_allocator());
+    memcpy(allocation.get(), blob.data(), blob.size());
+    std::unique_ptr<BlobContents> buf =
+        BlobContents::Create(std::move(allocation), blob.size());
+
+    Cache::CacheItemHelper* const cache_item_helper =
+        BlobContents::GetCacheItemHelper();
+    assert(cache_item_helper);
+
+    // Two Insert overloads are used: the helper-based one when the lowest
+    // used cache tier is the non-volatile block tier, the deleter-based one
+    // otherwise.
+    if (immutable_options_->lowest_used_cache_tier ==
+        CacheTier::kNonVolatileBlockTier) {
+      s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+                             buf->ApproximateMemoryUsage(),
+                             nullptr /* cache_handle */, priority);
+    } else {
+      s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+                             cache_item_helper->del_cb,
+                             nullptr /* cache_handle */, priority);
+    }
+
+    if (s.ok()) {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD);
+      RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+      // The object was handed to the cache via the raw pointer above, so the
+      // unique_ptr must let go of it; on failure it is destroyed here.
+      buf.release();
+    } else {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+    }
+  }
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+// Writes values to blob files and hands back the blob index (reference) for
+// each value written. Paths of the files it creates and a BlobFileAddition
+// per completed file are reported through the caller-supplied vectors
+// `blob_file_paths` and `blob_file_additions`. The class is non-copyable.
+class BlobFileBuilder {
+ public:
+  // Variant taking a VersionSet as the source of new blob file numbers.
+  BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+                  const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  // Variant taking an arbitrary callable that yields the next blob file
+  // number; all other parameters are the same as above.
+  BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+                  FileSystem* fs, const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  BlobFileBuilder(const BlobFileBuilder&) = delete;
+  BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+  ~BlobFileBuilder();
+
+  // Public interface: add a key/value pair (the resulting blob reference, if
+  // any, is returned through `blob_index`), finish the build, or abandon it
+  // with the given status.
+  Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+  Status Finish();
+  void Abandon(const Status& s);
+
+ private:
+  // Helpers for managing the blob file that is currently being written.
+  bool IsBlobFileOpen() const;
+  Status OpenBlobFileIfNeeded();
+  Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+  Status WriteBlobToFile(const Slice& key, const Slice& blob,
+                         uint64_t* blob_file_number, uint64_t* blob_offset);
+  Status CloseBlobFile();
+  Status CloseBlobFileIfNeeded();
+
+  Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+                                  uint64_t blob_offset) const;
+
+  // Configuration supplied at construction time.
+  std::function<uint64_t()> file_number_generator_;
+  FileSystem* fs_;
+  const ImmutableOptions* immutable_options_;
+  uint64_t min_blob_size_;
+  uint64_t blob_file_size_;
+  CompressionType blob_compression_type_;
+  PrepopulateBlobCache prepopulate_blob_cache_;
+  const FileOptions* file_options_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  int job_id_;
+  uint32_t column_family_id_;
+  std::string column_family_name_;
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  BlobFileCompletionCallback* blob_callback_;
+  BlobFileCreationReason creation_reason_;
+  // Caller-owned output vectors (stored as raw pointers).
+  std::vector<std::string>* blob_file_paths_;
+  std::vector<BlobFileAddition>* blob_file_additions_;
+  // Writer and running totals for the blob file currently being written.
+  std::unique_ptr<BlobLogWriter> writer_;
+  uint64_t blob_count_;
+  uint64_t blob_bytes_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Deterministic file number source for the tests. The counter starts at 1
+// and is pre-incremented, so the first number handed out is 2.
+class TestFileNumberGenerator {
+ public:
+  uint64_t operator()() { return ++next_file_number_; }
+
+ private:
+  uint64_t next_file_number_ = 1;
+};
+
+// Fixture that runs BlobFileBuilder against a MockEnv and can read a
+// generated blob file back for verification.
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+  BlobFileBuilderTest() {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+    clock_ = mock_env_->GetSystemClock().get();
+  }
+
+  // Reads the blob file at `blob_file_path` sequentially and checks its
+  // header, each record against `expected_key_value_pairs`, each entry of
+  // `blob_indexes` against the record it should reference, and the footer.
+  // The two vectors must have the same size.
+  void VerifyBlobFile(uint64_t blob_file_number,
+                      const std::string& blob_file_path,
+                      uint32_t column_family_id,
+                      CompressionType blob_compression_type,
+                      const std::vector<std::pair<std::string, std::string>>&
+                          expected_key_value_pairs,
+                      const std::vector<std::string>& blob_indexes) {
+    assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+    std::unique_ptr<FSRandomAccessFile> file;
+    constexpr IODebugContext* dbg = nullptr;
+    ASSERT_OK(
+        fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+    constexpr Statistics* statistics = nullptr;
+    BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+                                            statistics);
+
+    BlobLogHeader header;
+    ASSERT_OK(blob_log_reader.ReadHeader(&header));
+    ASSERT_EQ(header.version, kVersion1);
+    ASSERT_EQ(header.column_family_id, column_family_id);
+    ASSERT_EQ(header.compression, blob_compression_type);
+    ASSERT_FALSE(header.has_ttl);
+    ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+    for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+      BlobLogRecord record;
+      uint64_t blob_offset = 0;
+
+      ASSERT_OK(blob_log_reader.ReadRecord(
+          &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+      // Check the contents of the blob file
+      const auto& expected_key_value = expected_key_value_pairs[i];
+      const auto& key = expected_key_value.first;
+      const auto& value = expected_key_value.second;
+
+      ASSERT_EQ(record.key_size, key.size());
+      ASSERT_EQ(record.value_size, value.size());
+      ASSERT_EQ(record.expiration, 0);
+      ASSERT_EQ(record.key, key);
+      ASSERT_EQ(record.value, value);
+
+      // Make sure the blob reference returned by the builder points to the
+      // right place
+      BlobIndex blob_index;
+      ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+      ASSERT_FALSE(blob_index.IsInlined());
+      ASSERT_FALSE(blob_index.HasTTL());
+      ASSERT_EQ(blob_index.file_number(), blob_file_number);
+      ASSERT_EQ(blob_index.offset(), blob_offset);
+      ASSERT_EQ(blob_index.size(), value.size());
+    }
+
+    BlobLogFooter footer;
+    ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+    ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+    ASSERT_EQ(footer.expiration_range, ExpirationRange());
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;
+  SystemClock* clock_;
+  FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+  // Build a single blob file
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 4;
+  constexpr size_t value_offset = 1234;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckOneFile"),
+      0);
+  options.enable_blob_files = true;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  // Keys are "0".."9"; values are the decimal strings of 1234..1243.
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  // (2 is the first number TestFileNumberGenerator hands out.)
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+  ASSERT_EQ(
+      blob_file_addition.GetTotalBlobBytes(),
+      number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+  // Verify the contents of the new blob file as well as the blob references
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+  // Build multiple blob files: file size limit is set to the size of a single
+  // value, so each blob ends up in a file of its own
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+  ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+  // File numbers start at 2 (see TestFileNumberGenerator), hence i + 2.
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const uint64_t blob_file_number = i + 2;
+
+    ASSERT_EQ(blob_file_paths[i],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+
+    const auto& blob_file_addition = blob_file_additions[i];
+
+    ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+              BlobLogRecord::kHeaderSize + key_size + value_size);
+  }
+
+  // Verify the contents of the new blob files as well as the blob references
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+        expected_key_value_pairs[i]};
+    std::vector<std::string> blob_index{blob_indexes[i]};
+
+    VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+                   expected_key_value_pair, blob_index);
+  }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+  // All values are below the min_blob_size threshold; no blob files get written
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_InlinedValues"),
+      0);
+  options.enable_blob_files = true;
+  options.min_blob_size = 1024;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const std::string key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    const std::string value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    // Add succeeds but leaves the blob index empty for values that stay
+    // inline.
+    std::string blob_index;
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_TRUE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_TRUE(blob_file_paths.empty());
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+  // Build a blob file with a compressed blob
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 100;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string uncompressed_value(value_size, 'x');
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+  // Compress the same value independently to learn the expected on-disk size.
+  CompressionOptions opts;
+  CompressionContext context(kSnappyCompression);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       kSnappyCompression, sample_for_compression);
+
+  std::string compressed_value;
+  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+                              uncompressed_value.size(), &compressed_value));
+
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+  //
Verify the contents of the new blob file as well as the blob reference + std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{ + {key, compressed_value}}; + std::vector<std::string> blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kSnappyCompression, expected_key_value_pairs, blob_indexes); +} + +TEST_F(BlobFileBuilderTest, CompressionError) { + // Simulate an error during compression + if (!Snappy_Supported()) { + return; + } + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_CompressionError"), + 0); + options.enable_blob_files = true; + options.blob_compression_type = kSnappyCompression; + options.env = mock_env_.get(); + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector<std::string> blob_file_paths; + std::vector<BlobFileAddition> blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, + column_family_id, column_family_name, io_priority, write_hint, + nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", + [](void* arg) { + bool* ret = static_cast<bool*>(arg); + *ret = false; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption()); + + 
SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ( + blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_TRUE(blob_file_additions.empty()); +} + +TEST_F(BlobFileBuilderTest, Checksum) { + // Build a blob file with checksum + + class DummyFileChecksumGenerator : public FileChecksumGenerator { + public: + void Update(const char* /* data */, size_t /* n */) override {} + + void Finalize() override {} + + std::string GetChecksum() const override { return std::string("dummy"); } + + const char* Name() const override { return "DummyFileChecksum"; } + }; + + class DummyFileChecksumGenFactory : public FileChecksumGenFactory { + public: + std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator( + const FileChecksumGenContext& /* context */) override { + return std::unique_ptr<FileChecksumGenerator>( + new DummyFileChecksumGenerator); + } + + const char* Name() const override { return "DummyFileChecksumGenFactory"; } + }; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"), + 0); + options.enable_blob_files = true; + options.file_checksum_gen_factory = + std::make_shared<DummyFileChecksumGenFactory>(); + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector<std::string> blob_file_paths; + std::vector<BlobFileAddition> blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, "" 
/*db_id*/, "" /*db_session_id*/, job_id, + column_family_id, column_family_name, io_priority, write_hint, + nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + const std::string key("1"); + const std::string value("deadbeef"); + + std::string blob_index; + + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key.size() + value.size()); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum"); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy"); + + // Verify the contents of the new blob file as well as the blob reference + std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{ + {key, value}}; + std::vector<std::string> blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kNoCompression, expected_key_value_pairs, blob_indexes); +} + +class BlobFileBuilderIOErrorTest + : public testing::Test, + public testing::WithParamInterface<std::string> { + protected: + BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fs_ = mock_env_->GetFileSystem().get(); + } + + std::unique_ptr<Env> mock_env_; + FileSystem* fs_; + FileOptions file_options_; + std::string sync_point_; +}; + 
+INSTANTIATE_TEST_CASE_P( + BlobFileBuilderTest, BlobFileBuilderIOErrorTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(BlobFileBuilderIOErrorTest, IOError) { + // Simulate an I/O error during the specified step of Add() + // Note: blob_file_size will be set to value_size in order for the first blob + // to trigger close + constexpr size_t value_size = 8; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + options.blob_file_size = value_size; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector<std::string> blob_file_paths; + std::vector<BlobFileAddition> blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id, + column_family_id, column_family_name, io_priority, write_hint, + nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast<Status*>(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + 
ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") { + ASSERT_TRUE(blob_file_paths.empty()); + } else { + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ(blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + } + + ASSERT_TRUE(blob_file_additions.empty()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc new file mode 100644 index 000000000..1a6cdf688 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_cache.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_cache.h" + +#include <cassert> +#include <memory> + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr<IOTracer>& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard<BlobFileReader>* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr<BlobFileReader> reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, 
io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_->Insert(key, reader.get(), charge, + &DeleteCacheEntry<BlobFileReader>, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h new file mode 100644 index 000000000..8eec05f18 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_cache.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cinttypes> + +#include "cache/cache_helpers.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class BlobFileReader; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr<IOTracer>& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(uint64_t blob_file_number, + CacheHandleGuard<BlobFileReader>* blob_file_reader); + + private: + Cache* cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. 
+ Striped<port::Mutex, Slice> mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr<IOTracer> io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc new file mode 100644 index 000000000..d3a61b3c5 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_cache_test.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include <cassert> +#include <string> + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with a single blob in it. 
+void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr<FSWritableFile> file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + + std::unique_ptr<Env> mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + 
"BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard<BlobFileReader> first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard<BlobFileReader> second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = 
NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard<BlobFileReader> first; + CacheHandleGuard<BlobFileReader> second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + 
blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard<BlobFileReader> reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr<Cache> backing_cache = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard<BlobFileReader> reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsMemoryLimit()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + 
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h new file mode 100644 index 000000000..ffe65a0ff --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_completion_callback.h @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: + BlobFileCompletionCallback( + SstFileManager* sst_file_manager, InstrumentedMutex* mutex, + ErrorHandler* error_handler, EventLogger* event_logger, + const std::vector<std::shared_ptr<EventListener>>& listeners, + const std::string& dbname) + : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { +#ifndef ROCKSDB_LITE + sst_file_manager_ = sst_file_manager; + mutex_ = mutex; + error_handler_ = error_handler; +#else + (void)sst_file_manager; + (void)mutex; + (void)error_handler; +#endif // ROCKSDB_LITE + } + + void OnBlobFileCreationStarted(const std::string& file_name, + const std::string& column_family_name, + int job_id, + BlobFileCreationReason creation_reason) { +#ifndef ROCKSDB_LITE + // Notify the listeners. 
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, + column_family_name, file_name, + job_id, creation_reason); +#else + (void)file_name; + (void)column_family_name; + (void)job_id; + (void)creation_reason; +#endif + } + + Status OnBlobFileCompleted(const std::string& file_name, + const std::string& column_family_name, int job_id, + uint64_t file_number, + BlobFileCreationReason creation_reason, + const Status& report_status, + const std::string& checksum_value, + const std::string& checksum_method, + uint64_t blob_count, uint64_t blob_bytes) { + Status s; + +#ifndef ROCKSDB_LITE + auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } +#endif // !ROCKSDB_LITE + + // Notify the listeners. + EventHelpers::LogAndNotifyBlobFileCreationFinished( + event_logger_, listeners_, dbname_, column_family_name, file_name, + job_id, file_number, creation_reason, + (!report_status.ok() ? report_status : s), + (checksum_value.empty() ? kUnknownFileChecksum : checksum_value), + (checksum_method.empty() ? 
kUnknownFileChecksumFuncName + : checksum_method), + blob_count, blob_bytes); + return s; + } + + private: +#ifndef ROCKSDB_LITE + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE + EventLogger* event_logger_; + std::vector<std::shared_ptr<EventListener>> listeners_; + std::string dbname_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc new file mode 100644 index 000000000..52c336f49 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_garbage.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include <ostream> +#include <sstream> + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. +enum BlobFileGarbage::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileGarbage::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, garbage_blob_count_); + PutVarint64(output, garbage_blob_bytes_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. 
+ + TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileGarbage::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileGarbage"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &garbage_blob_count_)) { + return Status::Corruption(class_name, "Error decoding garbage blob count"); + } + + if (!GetVarint64(input, &garbage_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding garbage blob bytes"); + } + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileGarbage::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileGarbage::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() && + lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes(); +} + +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage) { + os << "blob_file_number: " << 
blob_file_garbage.GetBlobFileNumber() + << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount() + << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes(); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage) { + jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber() + << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount() + << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes(); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h new file mode 100644 index 000000000..6dc14ddca --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_garbage.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <cstdint> +#include <iosfwd> +#include <string> + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileGarbage { + public: + BlobFileGarbage() = default; + + BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : blob_file_number_(blob_file_number), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; +}; + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc new file mode 100644 index 000000000..292a8b38a --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_garbage.h" + +#include <cstdint> +#include <cstring> +#include <string> + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileGarbageTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) { + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_garbage, decoded); + } +}; + +TEST_F(BlobFileGarbageTest, Empty) { + BlobFileGarbage blob_file_garbage; + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t garbage_blob_count = 1; + constexpr uint64_t garbage_blob_bytes = 9876; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileGarbage blob_file_garbage; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count")); + } + + 
constexpr uint64_t garbage_blob_count = 4567; + PutVarint64(&str, garbage_blob_count); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes")); + } + + constexpr uint64_t garbage_blob_bytes = 12345678; + PutVarint64(&str, garbage_blob_bytes); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast<std::string*>(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t garbage_blob_count = 9999; + constexpr uint64_t garbage_blob_bytes = 100000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast<std::string*>(arg); + + constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + 
SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t garbage_blob_count = 100; + constexpr uint64_t garbage_blob_bytes = 2000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc new file mode 100644 index 000000000..4913137e5 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_meta.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_meta.h" + +#include <ostream> +#include <sstream> + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t SharedBlobFileMetaData::GetBlobFileSize() const { + return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize; +} + +std::string SharedBlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta) { + os << "blob_file_number: " << shared_meta.GetBlobFileNumber() + << " total_blob_count: " << shared_meta.GetTotalBlobCount() + << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes() + << " checksum_method: " << shared_meta.GetChecksumMethod() + << " checksum_value: " + << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +std::string BlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) { + const auto& shared_meta = meta.GetSharedMeta(); + assert(shared_meta); + os << (*shared_meta); + + os << " linked_ssts: {"; + for (uint64_t file_number : meta.GetLinkedSsts()) { + os << ' ' << file_number; + } + os << " }"; + + os << " garbage_blob_count: " << meta.GetGarbageBlobCount() + << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes(); + + return os; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h new file mode 100644 index 000000000..d7c8a1243 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_meta.h @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <cassert> +#include <iosfwd> +#include <memory> +#include <string> +#include <unordered_set> + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// SharedBlobFileMetaData represents the immutable part of blob files' metadata, +// like the blob file number, total number and size of blobs, or checksum +// method and value. There is supposed to be one object of this class per blob +// file (shared across all versions that include the blob file in question); +// hence, the type is neither copyable nor movable. A blob file can be marked +// obsolete when the corresponding SharedBlobFileMetaData object is destroyed. + +class SharedBlobFileMetaData { + public: + static std::shared_ptr<SharedBlobFileMetaData> Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + return std::shared_ptr<SharedBlobFileMetaData>(new SharedBlobFileMetaData( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value))); + } + + template <typename Deleter> + static std::shared_ptr<SharedBlobFileMetaData> Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, Deleter deleter) { + return std::shared_ptr<SharedBlobFileMetaData>( + new SharedBlobFileMetaData(blob_file_number, total_blob_count, + total_blob_bytes, std::move(checksum_method), + std::move(checksum_value)), + deleter); + } + + SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete; + SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete; + + SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete; + SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete; + + uint64_t GetBlobFileSize() const; + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { 
return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + std::string DebugString() const; + + private: + SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t blob_file_number_; + uint64_t total_blob_count_; + uint64_t total_blob_bytes_; + std::string checksum_method_; + std::string checksum_value_; +}; + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta); + +// BlobFileMetaData contains the part of the metadata for blob files that can +// vary across versions, like the amount of garbage in the blob file. In +// addition, BlobFileMetaData objects point to and share the ownership of the +// SharedBlobFileMetaData object for the corresponding blob file. Similarly to +// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They +// are meant to be jointly owned by the versions in which the blob file has the +// same (immutable *and* mutable) state. 
+ +class BlobFileMetaData { + public: + using LinkedSsts = std::unordered_set<uint64_t>; + + static std::shared_ptr<BlobFileMetaData> Create( + std::shared_ptr<SharedBlobFileMetaData> shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + return std::shared_ptr<BlobFileMetaData>( + new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes)); + } + + BlobFileMetaData(const BlobFileMetaData&) = delete; + BlobFileMetaData& operator=(const BlobFileMetaData&) = delete; + + BlobFileMetaData(BlobFileMetaData&&) = delete; + BlobFileMetaData& operator=(BlobFileMetaData&&) = delete; + + const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileSize() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileSize(); + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + uint64_t GetTotalBlobCount() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobCount(); + } + uint64_t GetTotalBlobBytes() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobBytes(); + } + const std::string& GetChecksumMethod() const { + assert(shared_meta_); + return shared_meta_->GetChecksumMethod(); + } + const std::string& GetChecksumValue() const { + assert(shared_meta_); + return shared_meta_->GetChecksumValue(); + } + + const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + std::string DebugString() const; + + private: + BlobFileMetaData(std::shared_ptr<SharedBlobFileMetaData> shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : shared_meta_(std::move(shared_meta)), + linked_ssts_(std::move(linked_ssts)), + 
garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) { + assert(shared_meta_); + assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount()); + assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes()); + } + + std::shared_ptr<SharedBlobFileMetaData> shared_meta_; + LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_; + uint64_t garbage_blob_bytes_; +}; + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc new file mode 100644 index 000000000..a4eabb605 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_reader.cc @@ -0,0 +1,610 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include <cassert> +#include <string> + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_log_format.h" +#include "file/file_prefetch_buffer.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/multiget_context.h" +#include "test_util/sync_point.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFileReader::Create( + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer, + std::unique_ptr<BlobFileReader>* blob_file_reader) { + assert(blob_file_reader); + assert(!*blob_file_reader); + + uint64_t file_size = 0; + 
std::unique_ptr<RandomAccessFileReader> file_reader; + + { + const Status s = + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, &file_size, &file_reader); + if (!s.ok()) { + return s; + } + } + + assert(file_reader); + + Statistics* const statistics = immutable_options.stats; + + CompressionType compression_type = kNoCompression; + + { + const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, + &compression_type); + if (!s.ok()) { + return s; + } + } + + { + const Status s = ReadFooter(file_reader.get(), file_size, statistics); + if (!s.ok()) { + return s; + } + } + + blob_file_reader->reset( + new BlobFileReader(std::move(file_reader), file_size, compression_type, + immutable_options.clock, statistics)); + + return Status::OK(); +} + +Status BlobFileReader::OpenFile( + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr<IOTracer>& io_tracer, uint64_t* file_size, + std::unique_ptr<RandomAccessFileReader>* file_reader) { + assert(file_size); + assert(file_reader); + + const auto& cf_paths = immutable_options.cf_paths; + assert(!cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(cf_paths.front().path, blob_file_number); + + FileSystem* const fs = immutable_options.fs.get(); + assert(fs); + + constexpr IODebugContext* dbg = nullptr; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize"); + + const Status s = + fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg); + if (!s.ok()) { + return s; + } + } + + if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return Status::Corruption("Malformed blob file"); + } + + std::unique_ptr<FSRandomAccessFile> file; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile"); + + const Status s = + fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg); + if (!s.ok()) { + return s; + } 
+ } + + assert(file); + + if (immutable_options.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + + file_reader->reset(new RandomAccessFileReader( + std::move(file), blob_file_path, immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); + + return Status::OK(); +} + +Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, + Statistics* statistics, + CompressionType* compression_type) { + assert(file_reader); + assert(compression_type); + + Slice header_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile"); + + constexpr uint64_t read_offset = 0; + constexpr size_t read_size = BlobLogHeader::kSize; + + // TODO: rate limit reading headers from blob files. + const Status s = ReadFromFile(file_reader, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult", + &header_slice); + } + + BlobLogHeader header; + + { + const Status s = header.DecodeFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (header.has_ttl || header.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + if (header.column_family_id != column_family_id) { + return Status::Corruption("Column family ID mismatch"); + } + + *compression_type = header.compression; + + return Status::OK(); +} + +Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics) { + assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); + assert(file_reader); + + Slice footer_slice; + Buffer buf; + AlignedBuf 
aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile"); + + const uint64_t read_offset = file_size - BlobLogFooter::kSize; + constexpr size_t read_size = BlobLogFooter::kSize; + + // TODO: rate limit reading footers from blob files. + const Status s = ReadFromFile(file_reader, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult", + &footer_slice); + } + + BlobLogFooter footer; + + { + const Status s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (footer.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + return Status::OK(); +} + +Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, + Buffer* buf, AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority) { + assert(slice); + assert(buf); + assert(aligned_buf); + + assert(file_reader); + + RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size); + + Status s; + + if (file_reader->use_direct_io()) { + constexpr char* scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + aligned_buf, rate_limiter_priority); + } else { + buf->reset(new char[read_size]); + constexpr AlignedBuf* aligned_scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, + buf->get(), aligned_scratch, rate_limiter_priority); + } + + if (!s.ok()) { + return s; + } + + if (slice->size() != read_size) { + return Status::Corruption("Failed to read data from blob file"); + } + + return Status::OK(); +} + +BlobFileReader::BlobFileReader( + std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size, + 
CompressionType compression_type, SystemClock* clock, + Statistics* statistics) + : file_reader_(std::move(file_reader)), + file_size_(file_size), + compression_type_(compression_type), + clock_(clock), + statistics_(statistics) { + assert(file_reader_); +} + +BlobFileReader::~BlobFileReader() = default; + +Status BlobFileReader::GetBlob( + const ReadOptions& read_options, const Slice& user_key, uint64_t offset, + uint64_t value_size, CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator, + std::unique_ptr<BlobContents>* result, uint64_t* bytes_read) const { + assert(result); + + const uint64_t key_size = user_key.size(); + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + return Status::Corruption("Invalid blob offset"); + } + + if (compression_type != compression_type_) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + // Note: if verify_checksum is set, we read the entire blob record to be able + // to perform the verification; otherwise, we just read the blob itself. Since + // the offset in BlobIndex actually points to the blob value, we need to make + // an adjustment in the former case. + const uint64_t adjustment = + read_options.verify_checksums + ? 
BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offset >= adjustment); + + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + + Slice record_slice; + Buffer buf; + AlignedBuf aligned_buf; + + bool prefetched = false; + + if (prefetch_buffer) { + Status s; + constexpr bool for_compaction = true; + + prefetched = prefetch_buffer->TryReadFromCache( + IOOptions(), file_reader_.get(), record_offset, + static_cast<size_t>(record_size), &record_slice, &s, + read_options.rate_limiter_priority, for_compaction); + if (!s.ok()) { + return s; + } + } + + if (!prefetched) { + TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, 1); + PERF_COUNTER_ADD(blob_read_byte, record_size); + PERF_TIMER_GUARD(blob_read_time); + const Status s = ReadFromFile(file_reader_.get(), record_offset, + static_cast<size_t>(record_size), statistics_, + &record_slice, &buf, &aligned_buf, + read_options.rate_limiter_priority); + if (!s.ok()) { + return s; + } + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult", + &record_slice); + + if (read_options.verify_checksums) { + const Status s = VerifyBlob(record_slice, user_key, value_size); + if (!s.ok()) { + return s; + } + } + + const Slice value_slice(record_slice.data() + adjustment, value_size); + + { + const Status s = UncompressBlobIfNeeded( + value_slice, compression_type, allocator, clock_, statistics_, result); + if (!s.ok()) { + return s; + } + } + + if (bytes_read) { + *bytes_read = record_size; + } + + return Status::OK(); +} + +void BlobFileReader::MultiGetBlob( + const ReadOptions& read_options, MemoryAllocator* allocator, + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>& + blob_reqs, + uint64_t* bytes_read) const { + const size_t num_blobs = blob_reqs.size(); + assert(num_blobs > 0); + assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE); + +#ifndef NDEBUG + for (size_t 
i = 0; i < num_blobs - 1; ++i) { + assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset); + } +#endif // !NDEBUG + + std::vector<FSReadRequest> read_reqs; + autovector<uint64_t> adjustments; + uint64_t total_len = 0; + read_reqs.reserve(num_blobs); + for (size_t i = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + const size_t key_size = req->user_key->size(); + const uint64_t offset = req->offset; + const uint64_t value_size = req->len; + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + *req->status = Status::Corruption("Invalid blob offset"); + continue; + } + if (req->compression != compression_type_) { + *req->status = + Status::Corruption("Compression type mismatch when reading a blob"); + continue; + } + + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(req->offset >= adjustment); + adjustments.push_back(adjustment); + + FSReadRequest read_req = {}; + read_req.offset = req->offset - adjustment; + read_req.len = req->len + adjustment; + read_reqs.emplace_back(read_req); + total_len += read_req.len; + } + + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); + + Buffer buf; + AlignedBuf aligned_buf; + + Status s; + bool direct_io = file_reader_->use_direct_io(); + if (direct_io) { + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = nullptr; + } + } else { + buf.reset(new char[total_len]); + std::ptrdiff_t pos = 0; + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = buf.get() + pos; + pos += read_reqs[i].len; + } + } + TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); + PERF_COUNTER_ADD(blob_read_count, num_blobs); + PERF_COUNTER_ADD(blob_read_byte, total_len); + s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), + direct_io ? 
&aligned_buf : nullptr, + read_options.rate_limiter_priority); + if (!s.ok()) { + for (auto& req : read_reqs) { + req.status.PermitUncheckedError(); + } + for (auto& blob_req : blob_reqs) { + BlobReadRequest* const req = blob_req.first; + assert(req); + assert(req->status); + + if (!req->status->IsCorruption()) { + // Avoid overwriting corruption status. + *req->status = s; + } + } + return; + } + + assert(s.ok()); + + uint64_t total_bytes = 0; + for (size_t i = 0, j = 0; i < num_blobs; ++i) { + BlobReadRequest* const req = blob_reqs[i].first; + assert(req); + assert(req->user_key); + assert(req->status); + + if (!req->status->ok()) { + continue; + } + + assert(j < read_reqs.size()); + auto& read_req = read_reqs[j++]; + const auto& record_slice = read_req.result; + if (read_req.status.ok() && record_slice.size() != read_req.len) { + read_req.status = + IOStatus::Corruption("Failed to read data from blob file"); + } + + *req->status = read_req.status; + if (!req->status->ok()) { + continue; + } + + // Verify checksums if enabled + if (read_options.verify_checksums) { + *req->status = VerifyBlob(record_slice, *req->user_key, req->len); + if (!req->status->ok()) { + continue; + } + } + + // Uncompress blob if needed + Slice value_slice(record_slice.data() + adjustments[i], req->len); + *req->status = + UncompressBlobIfNeeded(value_slice, compression_type_, allocator, + clock_, statistics_, &blob_reqs[i].second); + if (req->status->ok()) { + total_bytes += record_slice.size(); + } + } + + if (bytes_read) { + *bytes_read = total_bytes; + } +} + +Status BlobFileReader::VerifyBlob(const Slice& record_slice, + const Slice& user_key, uint64_t value_size) { + PERF_TIMER_GUARD(blob_checksum_time); + + BlobLogRecord record; + + const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize); + + { + const Status s = record.DecodeHeaderFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + if (record.key_size != user_key.size()) { + return 
Status::Corruption("Key size mismatch when reading blob"); + } + + if (record.value_size != value_size) { + return Status::Corruption("Value size mismatch when reading blob"); + } + + record.key = + Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size); + if (record.key != user_key) { + return Status::Corruption("Key mismatch when reading blob"); + } + + record.value = Slice(record.key.data() + record.key_size, value_size); + + { + TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC", + &record); + + const Status s = record.CheckBlobCRC(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status BlobFileReader::UncompressBlobIfNeeded( + const Slice& value_slice, CompressionType compression_type, + MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics, + std::unique_ptr<BlobContents>* result) { + assert(result); + + if (compression_type == kNoCompression) { + CacheAllocationPtr allocation = + AllocateBlock(value_slice.size(), allocator); + memcpy(allocation.get(), value_slice.data(), value_slice.size()); + + *result = BlobContents::Create(std::move(allocation), value_slice.size()); + + return Status::OK(); + } + + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + + size_t uncompressed_size = 0; + constexpr uint32_t compression_format_version = 2; + + CacheAllocationPtr output; + + { + PERF_TIMER_GUARD(blob_decompress_time); + StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); + output = UncompressData(info, value_slice.data(), value_slice.size(), + &uncompressed_size, compression_format_version, + allocator); + } + + TEST_SYNC_POINT_CALLBACK( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + + if (!output) { + return Status::Corruption("Unable to uncompress blob"); + } + + *result = BlobContents::Create(std::move(output), uncompressed_size); + + return Status::OK(); +} 
+ +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h new file mode 100644 index 000000000..75b756da1 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_reader.h @@ -0,0 +1,108 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cinttypes> +#include <memory> + +#include "db/blob/blob_read_request.h" +#include "file/random_access_file_reader.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Status; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +struct ReadOptions; +class Slice; +class FilePrefetchBuffer; +class BlobContents; +class Statistics; + /* Read access to the blobs of a single blob file. Instances are obtained through the static Create() factory (the constructor is private) and cannot be copied. */ +class BlobFileReader { + public: /* Create() reports success or failure through the returned Status and hands the new reader back through the reader out-parameter. */ + static Status Create(const ImmutableOptions& immutable_options, + const FileOptions& file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr<IOTracer>& io_tracer, + std::unique_ptr<BlobFileReader>* reader); + /* Not copyable. */ + BlobFileReader(const BlobFileReader&) = delete; + BlobFileReader& operator=(const BlobFileReader&) = delete; + + ~BlobFileReader(); + /* Reads the blob of user_key located at offset with size value_size. The contents are returned through result and the number of bytes read through bytes_read. */ + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t offset, uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + MemoryAllocator* allocator, + std::unique_ptr<BlobContents>* result, + uint64_t* bytes_read) const; + /* Batched variant of GetBlob(): each entry pairs a BlobReadRequest with the slot that receives its BlobContents. */ + // offsets must be sorted in ascending order by caller.
+ void MultiGetBlob( + const ReadOptions& read_options, MemoryAllocator* allocator, + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>& + blob_reqs, + uint64_t* bytes_read) const; + /* Accessors for compression_type_ and file_size_. */ + CompressionType GetCompressionType() const { return compression_type_; } + + uint64_t GetFileSize() const { return file_size_; } + + private: /* Private constructor; Create() is the public way to obtain an instance. */ + BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader, + uint64_t file_size, CompressionType compression_type, + SystemClock* clock, Statistics* statistics); + /* Static helpers for opening the file and reading its header and footer. NOTE(review): presumably called from Create(); confirm in the .cc file. */ + static Status OpenFile(const ImmutableOptions& immutable_options, + const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr<IOTracer>& io_tracer, + uint64_t* file_size, + std::unique_ptr<RandomAccessFileReader>* file_reader); + + static Status ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, Statistics* statistics, + CompressionType* compression_type); + + static Status ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics); + + using Buffer = std::unique_ptr<char[]>; + /* Reads read_size bytes starting at read_offset and exposes them through slice. NOTE(review): buf and aligned_buf presumably provide the backing storage; confirm in the .cc file. */ + static Status ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, Buffer* buf, + AlignedBuf* aligned_buf, + Env::IOPriority rate_limiter_priority); + /* Validates a raw blob record against the expected user key and value size. */ + static Status VerifyBlob(const Slice& record_slice, const Slice& user_key, + uint64_t value_size); + /* Converts the stored value bytes into a BlobContents object, uncompressing them unless compression_type is kNoCompression. */ + static Status UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + MemoryAllocator* allocator, + SystemClock* clock, + Statistics* statistics, + std::unique_ptr<BlobContents>* result); + + std::unique_ptr<RandomAccessFileReader> file_reader_; + uint64_t file_size_; + CompressionType compression_type_; + SystemClock* clock_; + Statistics* statistics_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc
b/src/rocksdb/db/blob/blob_file_reader_test.cc new file mode 100644 index 000000000..03458e2b5 --- /dev/null +++ b/src/rocksdb/db/blob/blob_file_reader_test.cc @@ -0,0 +1,1024 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include <cassert> +#include <string> + +#include "db/blob/blob_contents.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/compression.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with `num` blobs in it. 
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector<Slice>& keys, + const std::vector<Slice>& blobs, CompressionType compression, + std::vector<uint64_t>& blob_offsets, + std::vector<uint64_t>& blob_sizes) { /* keys and blobs are the inputs; blob_offsets and blob_sizes are filled in by this function and must already have the same length as keys (asserted below). */ + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + /* The blob file is created in the first column family path. */ + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr<FSWritableFile> file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + /* compressed_blobs holds the compressed payloads that the Slices in blobs_to_write are built from; blob_sizes records the size of what is actually written. */ + std::vector<std::string> compressed_blobs(num); + std::vector<Slice> blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { +
ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] = compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + /* Append one record per key and blob pair; AddRecord() is handed blob_offsets[i] as the output for the blob offset. */ + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + /* Finish the file with a footer carrying the blob count and the caller-supplied footer expiration range. */ + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +// Creates a test blob file with a single blob in it. Note: this method +// makes it possible to test various corner cases by allowing the caller +// to specify the contents of various blob file header/footer fields. +void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const Slice& key, + const Slice& blob, CompressionType compression, + uint64_t* blob_offset, uint64_t* blob_size) { /* Wraps the single key and blob in one-element vectors and delegates to the overload above. */ + std::vector<Slice> keys{key}; + std::vector<Slice> blobs{blob}; + std::vector<uint64_t> blob_offsets{0}; + std::vector<uint64_t> blob_sizes{0}; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, keys, blobs, compression, blob_offsets, + blob_sizes); /* Both out-parameters are optional and only written when non-null. */ + if (blob_offset) { + *blob_offset = blob_offsets[0]; + } + if (blob_size) { + *blob_size = blob_sizes[0]; + } +} + +} // anonymous namespace + /* Test fixture that owns a MockEnv created on top of Env::Default(). */ +class BlobFileReaderTest : public testing::Test { + protected: + BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + std::unique_ptr<Env> mock_env_; +}; + /* Writes three uncompressed blobs and exercises GetBlob() and MultiGetBlob(), first with valid input and then with invalid offsets, compression type, key size, key and value size. */ +TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( +
test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_CreateReaderAndGetBlob"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 3; + const std::vector<std::string> key_strs = {"key1", "key2", "key3"}; + const std::vector<std::string> blob_strs = {"blob1", "blob2", "blob3"}; + /* The Slices below are built from the strings in key_strs and blob_strs. */ + const std::vector<Slice> keys = {key_strs[0], key_strs[1], key_strs[2]}; + const std::vector<Slice> blobs = {blob_strs[0], blob_strs[1], blob_strs[2]}; + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + /* blob_offsets and blob_sizes were filled in by WriteBlobFile() and are used for all reads below. */ + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + /* Checksum verification is off here: bytes_read is expected to equal the stored blob size. */ + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[0]); + ASSERT_EQ(bytes_read, blob_sizes[0]); + + // MultiGetBlob + bytes_read = 0; + size_t total_size = 0; + + std::array<Status, num_blobs> statuses_buf; + std::array<BlobReadRequest, num_blobs> requests_buf; +
autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>> + blob_reqs; + /* One request per blob; each request is given statuses_buf[i] to report its outcome. */ + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + const auto& result = blob_reqs[i].second; + + ASSERT_OK(statuses_buf[i]); + ASSERT_NE(result, nullptr); + ASSERT_EQ(result->data(), blobs[i]); + total_size += blob_sizes[i]; + } + ASSERT_EQ(bytes_read, total_size); + } + + read_options.verify_checksums = true; + /* Checksum verification is on here: bytes_read is expected to also include the record header adjustment for the key. */ + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[1]); + + const uint64_t key_size = keys[1].size(); + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_sizes[1]); + } + + // Invalid offset (too close to start of file) + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0] - 1, + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Invalid offset (too close to end of file) + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[2], blob_offsets[2] + 1, + blob_sizes[2], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect compression type + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read =
0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kZSTD, prefetch_buffer, allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect key size + { + constexpr char shorter_key[] = "k"; + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + /* sizeof(shorter_key) counts the terminating NUL, hence the + 1: the offset is moved back by the difference between the two key lengths. */ + ASSERT_TRUE(reader + ->GetBlob(read_options, shorter_key, + blob_offsets[0] - + (keys[0].size() - sizeof(shorter_key) + 1), + blob_sizes[0], kNoCompression, prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector<std::reference_wrapper<const Slice>> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1); + key_refs[1] = std::cref(shorter_key_slice); + /* Only the second request uses the shorter key; its offset is moved back by the key length difference as well. */ + autovector<uint64_t> offsets{ + blob_offsets[0], + blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()), + blob_offsets[2]}; + + std::array<Status, num_blobs> statuses_buf; + std::array<BlobReadRequest, num_blobs> requests_buf; + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + if (i == 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect key + { + constexpr char incorrect_key[] = "foo1"; + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, incorrect_key, blob_offsets[0], + blob_sizes[0], kNoCompression,
prefetch_buffer, + allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector<std::reference_wrapper<const Slice>> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1); + key_refs[2] = std::cref(wrong_key_slice); + /* Only the last request carries the wrong key, so only that one is expected to fail. */ + std::array<Status, num_blobs> statuses_buf; + std::array<BlobReadRequest, num_blobs> requests_buf; + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + requests_buf[i] = + BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i], + kNoCompression, nullptr, &statuses_buf[i]); + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + if (i == num_blobs - 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect value size + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1] + 1, kNoCompression, + prefetch_buffer, allocator, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector<std::reference_wrapper<const Slice>> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + + std::array<Status, num_blobs> statuses_buf; + std::array<BlobReadRequest, num_blobs> requests_buf; + /* Only the second request asks for a value size that is one byte too large. */ + requests_buf[0] = + BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0], + kNoCompression, nullptr, &statuses_buf[0]); + requests_buf[1] = + BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1, + kNoCompression, nullptr, &statuses_buf[1]); + requests_buf[2] = +
BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2], + kNoCompression, nullptr, &statuses_buf[2]); + + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>> + blob_reqs; + + for (size_t i = 0; i < num_blobs; ++i) { + blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>()); + } + + reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read); + /* Every request except the one at index 1 is expected to succeed. */ + for (size_t i = 0; i < num_blobs; ++i) { + if (i != 1) { + ASSERT_OK(statuses_buf[i]); + } else { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } + } + } +} + +TEST_F(BlobFileReaderTest, Malformed) { + // Write a blob file consisting of nothing but a header, and make sure we + // detect the error when we open it for reading + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr uint64_t blob_file_number = 1; + /* The writer lives in its own scope; only the header is written, no records and no footer. */ + { + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr<FSWritableFile> file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), + immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + } + + constexpr HistogramImpl*
blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + /* A blob file whose header is written with has_ttl set is expected to be rejected by Create() with Corruption. */ +TEST_F(BlobFileReaderTest, TTL) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = true; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + /* A blob file whose header carries the expiration range (1, 2) is expected to be rejected by Create() with Corruption. */ +TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInHeader"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + const ExpirationRange expiration_range_header( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr ExpirationRange expiration_range_footer; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0;
+ uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + /* A blob file whose footer carries the expiration range (1, 2) is expected to be rejected by Create() with Corruption. */ +TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInFooter"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range_header; + const ExpirationRange expiration_range_footer( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + /* Opening a blob file with a column family ID other than the one it was written with is expected to fail with Corruption. */ +TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_IncorrectColumnFamily"), + 0); + options.enable_blob_files = true; + +
ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + /* The file was written for column family 1; opening it as column family 2 must fail. */ + constexpr uint32_t incorrect_column_family_id = 2; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + incorrect_column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) + .IsCorruption()); +} + /* Overwrites the blob_crc of the record via a sync point callback and expects GetBlob() to report Corruption without returning a value. */ +TEST_F(BlobFileReaderTest, BlobCRCError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::VerifyBlob:CheckBlobCRC",
[](void* arg) { + BlobLogRecord* const record = static_cast<BlobLogRecord*>(arg); + assert(record); + /* Replace the stored CRC with an arbitrary value; the test then expects the read to fail. */ + record->blob_crc = 0xfaceb00c; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, allocator, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + /* Reads a Snappy-compressed blob back through GetBlob(), with and without checksum verification. Returns early when Snappy is not supported. */ +TEST_F(BlobFileReaderTest, Compression) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; +
constexpr MemoryAllocator* allocator = nullptr; + /* blob_size is the size reported by WriteBlobFile() for the compressed blob, while value->data() must match the original uncompressed blob. */ + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blob); + ASSERT_EQ(bytes_read, blob_size); + } + + read_options.verify_checksums = true; + /* With checksum verification on, bytes_read is expected to also include the record header adjustment for the key. */ + { + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); + } +} + /* Resets the uncompressed output via a sync point callback and expects GetBlob() to report Corruption without returning a value. Returns early when Snappy is not supported. */ +TEST_F(BlobFileReaderTest, UncompressionError) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_UncompressionError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( +
"BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { + CacheAllocationPtr* const output = + static_cast<CacheAllocationPtr*>(arg); + assert(output); + /* Drop the buffer so that the reader is left with a null output. */ + output->reset(); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + /* Fixture parameterized by the name of the sync point at which an I/O error is injected; it wraps the MockEnv in a FaultInjectionTestEnv. */ +class BlobFileReaderIOErrorTest + : public testing::Test, + public testing::WithParamInterface<std::string> { + protected: + BlobFileReaderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get())); + } + + std::unique_ptr<Env> mock_env_; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::OpenFile:GetFileSize", + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::ReadHeader:ReadFromFile", + "BlobFileReader::ReadFooter:ReadFromFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(BlobFileReaderIOErrorTest, IOError) { + // Simulates an I/O error during the specified step + + Options options; + options.env = fault_injection_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(fault_injection_env_.get(), + "BlobFileReaderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool
has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + /* When the parameterized sync point is hit, the filesystem is deactivated with an IOError status; NOTE(review): presumably later file operations then return that error. */ + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + /* For every sync point except the GetBlob one, the failure is expected from Create() itself. */ + const bool fail_during_create = + (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); + + if (fail_during_create) { + ASSERT_TRUE(s.IsIOError()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, allocator, + &value, &bytes_read) + .IsIOError()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + /* Fixture parameterized by the name of the sync point at which the data that was read is tampered with. */ +class BlobFileReaderDecodingErrorTest + : public testing::Test, + public testing::WithParamInterface<std::string> { + protected: + BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + } + + std::unique_ptr<Env> mock_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest, +
::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::ReadHeader:TamperWithResult", + "BlobFileReader::ReadFooter:TamperWithResult", + "BlobFileReader::GetBlob:TamperWithResult"})); + /* Removes the first byte of the slice handed to the parameterized sync point and expects Corruption from Create() or from GetBlob(), depending on the sync point. */ +TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderDecodingErrorTest_DecodingError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { + Slice* const slice = static_cast<Slice*>(arg); + assert(slice); + assert(!slice->empty()); + /* Shorten the data by one byte from the front. */ + slice->remove_prefix(1); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileReader> reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + /* For every sync point except the GetBlob one, the failure is expected from Create() itself. */ + const bool fail_during_create = + sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; + + if (fail_during_create) { + ASSERT_TRUE(s.IsCorruption()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr MemoryAllocator* allocator = nullptr; + + std::unique_ptr<BlobContents> value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer,
allocator, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(value, nullptr); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc new file mode 100644 index 000000000..d328d7ff4 --- /dev/null +++ b/src/rocksdb/db/blob/blob_garbage_meter.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + flows_[blob_file_number].AddInFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + // Note: in order to measure the amount of additional garbage, we only need to + // track the outflow for preexisting files, i.e. those that also had inflow. 
+ // (Newly written files would only have outflow.) + auto it = flows_.find(blob_file_number); + if (it == flows_.end()) { + return Status::OK(); + } + + it->second.AddOutFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes) { + assert(blob_file_number); + assert(*blob_file_number == kInvalidBlobFileNumber); + assert(bytes); + assert(*bytes == 0); + + ParsedInternalKey ikey; + + { + constexpr bool log_err_key = false; + const Status s = ParseInternalKey(key, &ikey, log_err_key); + if (!s.ok()) { + return s; + } + } + + if (ikey.type != kTypeBlobIndex) { + return Status::OK(); + } + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + } + + if (blob_index.IsInlined() || blob_index.HasTTL()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + *blob_file_number = blob_index.file_number(); + *bytes = + blob_index.size() + + BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size()); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h new file mode 100644 index 000000000..a6c04b0b2 --- /dev/null +++ b/src/rocksdb/db/blob/blob_garbage_meter.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> +#include <cstdint> +#include <unordered_map> + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// A class that can be used to compute the amount of additional garbage +// generated by a compaction. 
It parses the keys and blob references in the +// input and output of a compaction, and aggregates the "inflow" and "outflow" +// on a per-blob file basis. The amount of additional garbage for any given blob +// file can then be computed by subtracting the outflow from the inflow. +class BlobGarbageMeter { + public: + // A class to store the number and total size of blobs on a per-blob file + // basis. + class BlobStats { + public: + void Add(uint64_t bytes) { + ++count_; + bytes_ += bytes; + } + void Add(uint64_t count, uint64_t bytes) { + count_ += count; + bytes_ += bytes; + } + + uint64_t GetCount() const { return count_; } + uint64_t GetBytes() const { return bytes_; } + + private: + uint64_t count_ = 0; + uint64_t bytes_ = 0; + }; + + // A class to keep track of the "inflow" and the "outflow" and to compute the + // amount of additional garbage for a given blob file. + class BlobInOutFlow { + public: + void AddInFlow(uint64_t bytes) { + in_flow_.Add(bytes); + assert(IsValid()); + } + void AddOutFlow(uint64_t bytes) { + out_flow_.Add(bytes); + assert(IsValid()); + } + + const BlobStats& GetInFlow() const { return in_flow_; } + const BlobStats& GetOutFlow() const { return out_flow_; } + + bool IsValid() const { + return in_flow_.GetCount() >= out_flow_.GetCount() && + in_flow_.GetBytes() >= out_flow_.GetBytes(); + } + bool HasGarbage() const { + assert(IsValid()); + return in_flow_.GetCount() > out_flow_.GetCount(); + } + uint64_t GetGarbageCount() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetCount() - out_flow_.GetCount(); + } + uint64_t GetGarbageBytes() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetBytes() - out_flow_.GetBytes(); + } + + private: + BlobStats in_flow_; + BlobStats out_flow_; + }; + + Status ProcessInFlow(const Slice& key, const Slice& value); + Status ProcessOutFlow(const Slice& key, const Slice& value); + + const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const { + return 
flows_; + } + + private: + static Status Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes); + + std::unordered_map<uint64_t, BlobInOutFlow> flows_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc new file mode 100644 index 000000000..ba53f06f1 --- /dev/null +++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include <string> +#include <vector> + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(BlobGarbageMeterTest, MeasureGarbage) { + BlobGarbageMeter blob_garbage_meter; + + struct BlobDescriptor { + std::string user_key; + uint64_t blob_file_number; + uint64_t offset; + uint64_t size; + CompressionType compression_type; + bool has_in_flow; + bool has_out_flow; + + uint64_t GetExpectedBytes() const { + return size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size()); + } + }; + + // Note: blob file 4 has the same inflow and outflow and hence no additional + // garbage. Blob file 5 has less outflow than inflow and thus it does have + // additional garbage. Blob file 6 is a newly written file (i.e. no inflow, + // only outflow) and is thus not tracked by the meter. 
+ std::vector<BlobDescriptor> blobs{ + {"key", 4, 1234, 555, kLZ4Compression, true, true}, + {"other_key", 4, 6789, 101010, kLZ4Compression, true, true}, + {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true}, + {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true}, + {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false}, + {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false}, + {"new_key", 6, 7777, 9999, kNoCompression, false, true}}; + + for (const auto& blob : blobs) { + constexpr SequenceNumber seq = 123; + const InternalKey key(blob.user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + std::string value; + BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size, + blob.compression_type); + const Slice value_slice(value); + + if (blob.has_in_flow) { + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + } + if (blob.has_out_flow) { + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + } + } + + const auto& flows = blob_garbage_meter.flows(); + ASSERT_EQ(flows.size(), 2); + + { + const auto it = flows.find(4); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + constexpr uint64_t expected_count = 2; + const uint64_t expected_bytes = + blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes(); + + const auto& in = flow.GetInFlow(); + ASSERT_EQ(in.GetCount(), expected_count); + ASSERT_EQ(in.GetBytes(), expected_bytes); + + const auto& out = flow.GetOutFlow(); + ASSERT_EQ(out.GetCount(), expected_count); + ASSERT_EQ(out.GetBytes(), expected_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_FALSE(flow.HasGarbage()); + } + + { + const auto it = flows.find(5); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + const auto& in = flow.GetInFlow(); + + constexpr uint64_t expected_in_count = 4; + const uint64_t expected_in_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() + + blobs[4].GetExpectedBytes() + 
blobs[5].GetExpectedBytes(); + + ASSERT_EQ(in.GetCount(), expected_in_count); + ASSERT_EQ(in.GetBytes(), expected_in_bytes); + + const auto& out = flow.GetOutFlow(); + + constexpr uint64_t expected_out_count = 2; + const uint64_t expected_out_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes(); + + ASSERT_EQ(out.GetCount(), expected_out_count); + ASSERT_EQ(out.GetBytes(), expected_out_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_TRUE(flow.HasGarbage()); + ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count); + ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes); + } +} + +TEST(BlobGarbageMeterTest, PlainValue) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeValue); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + ASSERT_TRUE(blob_garbage_meter.flows().empty()); +} + +TEST(BlobGarbageMeterTest, CorruptInternalKey) { + constexpr char corrupt_key[] = "i_am_corrupt"; + const Slice key_slice(corrupt_key); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, CorruptBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "i_am_not_a_blob_index"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, 
value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr uint64_t expiration = 1234567890; + constexpr char inlined_value[] = "inlined"; + + std::string value; + BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value); + + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h new file mode 100644 index 000000000..e9944d784 --- /dev/null +++ b/src/rocksdb/db/blob/blob_index.h @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <sstream> +#include <string> + +#include "rocksdb/compression_type.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. 
+// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. ValueType::kTypeValue). 
+class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + BlobIndex(const BlobIndex&) = default; + BlobIndex& operator=(const BlobIndex&) = default; + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + CompressionType compression() const { + assert(!IsInlined()); + return compression_; + } + + Status DecodeFrom(Slice slice) { + const char* kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast<Type>(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption(kErrorMessage, + "Unknown blob index type: " + + std::to_string(static_cast<char>(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast<CompressionType>(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + std::string DebugString(bool output_hex) const { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] 
file:" << file_number_ << " offset:" << offset_ + << " size:" << size_ + << " compression: " << CompressionTypeToString(compression_); + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast<char>(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, + uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast<char>(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast<char>(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast<char>(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast<char>(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType compression_ = kNoCompression; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc new file mode 100644 index 000000000..8e26281e3 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_format.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_format.h" + +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +void BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); +} + +Status BlobLogHeader::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast<CompressionType>(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); + } + return Status::OK(); +} + +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); 
+ PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); +} + +Status BlobLogFooter::DecodeFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t)); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number = 0; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); + } + return Status::OK(); +} + +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); +} + +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + const char* kErrorMessage = "Error while decoding blob record"; + if (src.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); + } + uint32_t src_crc 
= 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); + } + return Status::OK(); +} + +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h new file mode 100644 index 000000000..607db2367 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_format.h @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Log format information shared by reader and writer. 
+ +#pragma once + +#include <memory> +#include <utility> + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; + +using ExpirationRange = std::pair<uint64_t, uint64_t>; + +// clang-format off + +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. 
+ +// clang-format on + +struct BlobLogHeader { + static constexpr size_t kSize = 30; + + BlobLogHeader() = default; + BlobLogHeader(uint32_t _column_family_id, CompressionType _compression, + bool _has_ttl, const ExpirationRange& _expiration_range) + : column_family_id(_column_family_id), + compression(_compression), + has_ttl(_has_ttl), + expiration_range(_expiration_range) {} + + uint32_t version = kVersion1; + uint32_t column_family_id = 0; + CompressionType compression = kNoCompression; + bool has_ttl = false; + ExpirationRange expiration_range; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// clang-format off + +// Format of blob log file footer (32 bytes): +// +// +--------------+------------+-------------------+------------+ +// | magic number | blob count | expiration range | footer CRC | +// +--------------+------------+-------------------+------------+ +// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 | +// +--------------+------------+-------------------+------------+ +// +// The footer will be presented only when the blob file is properly closed. +// +// Unlike the same field in file header, expiration range in the footer is the +// range of smallest and largest expiration of the data in this file. 
+ +// clang-format on + +struct BlobLogFooter { + static constexpr size_t kSize = 32; + + uint64_t blob_count = 0; + ExpirationRange expiration_range = std::make_pair(0, 0); + uint32_t crc = 0; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// clang-format off + +// Blob record format (32 bytes header + key + value): +// +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | key length | value length | expiration | header CRC | blob CRC | key | value | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// +// If file has has_ttl = false, expiration field is always 0, and the blob +// doesn't has expiration. +// +// Also note that if compression is used, value is compressed value and value +// length is compressed value length. +// +// Header CRC is the checksum of (key_len + val_len + expiration), while +// blob CRC is the checksum of (key + value). +// +// We could use variable length encoding (Varint64) to save more space, but it +// make reader more complicated. + +// clang-format on + +struct BlobLogRecord { + // header include fields up to blob CRC + static constexpr size_t kHeaderSize = 32; + + // Note that the offset field of BlobIndex actually points to the blob value + // as opposed to the start of the blob record. The following method can + // be used to calculate the adjustment needed to read the blob record header. 
+ static constexpr uint64_t CalculateAdjustmentForRecordHeader( + uint64_t key_size) { + return key_size + kHeaderSize; + } + + uint64_t key_size = 0; + uint64_t value_size = 0; + uint64_t expiration = 0; + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + Slice key; + Slice value; + std::unique_ptr<char[]> key_buf; + std::unique_ptr<char[]> value_buf; + + uint64_t record_size() const { return kHeaderSize + key_size + value_size; } + + void EncodeHeaderTo(std::string* dst); + + Status DecodeHeaderFrom(Slice src); + + Status CheckBlobCRC() const; +}; + +// Checks whether a blob offset is potentially valid or not. +inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size, + uint64_t value_size, uint64_t file_size) { + if (value_offset < + BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) { + return false; + } + + if (value_offset + value_size + BlobLogFooter::kSize > file_size) { + return false; + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc new file mode 100644 index 000000000..778725189 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "db/blob/blob_log_sequential_reader.h" + +#include "file/random_access_file_reader.h" +#include "monitoring/statistics.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogSequentialReader::BlobLogSequentialReader( + std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock, + Statistics* statistics) + : file_(std::move(file_reader)), + clock_(clock), + statistics_(statistics), + next_byte_(0) {} + +BlobLogSequentialReader::~BlobLogSequentialReader() = default; + +Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, + char* buf) { + assert(slice); + assert(file_); + + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?) + Status s = + file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size), slice, + buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + next_byte_ += size; + if (!s.ok()) { + return s; + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size()); + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; +} + +Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) { + assert(header); + assert(next_byte_ == 0); + + static_assert(BlobLogHeader::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogHeader::kSize"); + + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); + } + + return header->DecodeFrom(buffer_); +} + +Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record, + ReadLevel level, + uint64_t* blob_offset) { + assert(record); + static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogRecord::kHeaderSize"); + + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, 
header_buf_); + if (!s.ok()) { + return s; + } + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption("EOF reached before record header"); + } + + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; + } + + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } + + switch (level) { + case kReadHeader: + next_byte_ += kb_size; + break; + + case kReadHeaderKey: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + next_byte_ += record->value_size; + break; + + case kReadHeaderKeyBlob: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + if (s.ok()) { + record->value_buf.reset(new char[record->value_size]); + s = ReadSlice(record->value_size, &record->value, + record->value_buf.get()); + } + if (s.ok()) { + s = record->CheckBlobCRC(); + } + break; + } + return s; +} + +Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) { + assert(footer); + static_assert(BlobLogFooter::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogFooter::kSize"); + + Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogFooter::kSize) { + return Status::Corruption("EOF reached before file footer"); + } + + return footer->DecodeFrom(buffer_); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h new file mode 100644 index 000000000..98afa8518 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include <memory> + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c)) + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReader; +class Env; +class Statistics; +class Status; +class SystemClock; + +/** + * BlobLogSequentialReader is a general purpose log stream reader + * implementation. The actual job of reading from the device is implemented by + * the RandomAccessFileReader interface. + * + * Please see BlobLogWriter for details on the file and record layout. + */ + +class BlobLogSequentialReader { + public: + enum ReadLevel { + kReadHeader, + kReadHeaderKey, + kReadHeaderKeyBlob, + }; + + // Create a reader that will return log records from "*file_reader". + BlobLogSequentialReader(std::unique_ptr<RandomAccessFileReader>&& file_reader, + SystemClock* clock, Statistics* statistics); + + // No copying allowed + BlobLogSequentialReader(const BlobLogSequentialReader&) = delete; + BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete; + + ~BlobLogSequentialReader(); + + Status ReadHeader(BlobLogHeader* header); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. The contents filled in + // *record will only be valid until the next mutating operation on this + // reader. + // If blob_offset is non-null, return offset of the blob through it. 
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadFooter(BlobLogFooter* footer); + + void ResetNextByte() { next_byte_ = 0; } + + uint64_t GetNextByte() const { return next_byte_; } + + private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + + const std::unique_ptr<RandomAccessFileReader> file_; + SystemClock* clock_; + + Statistics* statistics_; + + Slice buffer_; + char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize, + BlobLogRecord::kHeaderSize)]; + + // which byte to read next + uint64_t next_byte_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#undef MAX_HEADER_SIZE
\ No newline at end of file diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc new file mode 100644 index 000000000..9dbac7f25 --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_writer.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_log_writer.h" + +#include <cstdint> +#include <string> + +#include "db/blob/blob_log_format.h" +#include "file/writable_file_writer.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogWriter::BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) + : dest_(std::move(dest)), + clock_(clock), + statistics_(statistics), + log_number_(log_number), + block_offset_(boffset), + use_fsync_(use_fs), + do_flush_(do_flush), + last_elem_type_(kEtNone) {} + +BlobLogWriter::~BlobLogWriter() = default; + +Status BlobLogWriter::Sync() { + TEST_SYNC_POINT("BlobLogWriter::Sync"); + + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + Status s = dest_->Sync(use_fsync_); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + return s; +} + +Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { + assert(block_offset_ == 0); + assert(last_elem_type_ == kEtNone); + std::string str; + header.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + if (do_flush_) { + s = dest_->Flush(); + } + } + last_elem_type_ = kEtFileHdr; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + return s; 
+} + +Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, + std::string* checksum_method, + std::string* checksum_value) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string str; + footer.EncodeTo(&str); + + Status s; + if (dest_->seen_error()) { + s.PermitUncheckedError(); + return Status::IOError("Seen Error. Skip closing."); + } else { + s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + + s = Sync(); + + if (s.ok()) { + s = dest_->Close(); + + if (s.ok()) { + assert(!!checksum_method == !!checksum_value); + + if (checksum_method) { + assert(checksum_method->empty()); + + std::string method = dest_->GetFileChecksumFuncName(); + if (method != kUnknownFileChecksumFuncName) { + *checksum_method = std::move(method); + } + } + if (checksum_value) { + assert(checksum_value->empty()); + + std::string value = dest_->GetFileChecksum(); + if (value != kUnknownFileChecksum) { + *checksum_value = std::move(value); + } + } + } + } + } + + dest_.reset(); + } + + last_elem_type_ = kEtFileFooter; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, expiration); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t* key_offset, uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, 0); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +void 
BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); +} + +Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, + const Slice& key, const Slice& val, + uint64_t* key_offset, + uint64_t* blob_offset) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + Status s = dest_->Append(Slice(headerbuf)); + if (s.ok()) { + s = dest_->Append(key); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (do_flush_ && s.ok()) { + s = dest_->Flush(); + } + + *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; + *blob_offset = *key_offset + key.size(); + block_offset_ = *blob_offset + val.size(); + last_elem_type_ = kEtRecord; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h new file mode 100644 index 000000000..c1f9f31ad --- /dev/null +++ b/src/rocksdb/db/blob/blob_log_writer.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include <cstdint> +#include <memory> +#include <string> + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; +class SystemClock; +/** + * BlobLogWriter is the blob log stream writer. It provides an append-only + * abstraction for writing blob data. 
+ * + * + * Look at blob_db_format.h to see the details of the record formats. + */ + +class BlobLogWriter { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this BlobLogWriter is in use. + BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock, + Statistics* statistics, uint64_t log_number, bool use_fsync, + bool do_flush, uint64_t boffset = 0); + // No copying allowed + BlobLogWriter(const BlobLogWriter&) = delete; + BlobLogWriter& operator=(const BlobLogWriter&) = delete; + + ~BlobLogWriter(); + + static void ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, + uint64_t* key_offset, uint64_t* blob_offset); + + Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, + const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method, + std::string* checksum_value); + + Status WriteHeader(BlobLogHeader& header); + + WritableFileWriter* file() { return dest_.get(); } + + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + Status Sync(); + + private: + std::unique_ptr<WritableFileWriter> dest_; + SystemClock* clock_; + Statistics* statistics_; + uint64_t log_number_; + uint64_t block_offset_; // Current offset in block + bool use_fsync_; + bool do_flush_; + + public: + enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; + ElemType last_elem_type_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h new file mode 100644 index 000000000..f9668ca2e --- /dev/null +++ 
b/src/rocksdb/db/blob/blob_read_request.h @@ -0,0 +1,58 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cinttypes> + +#include "rocksdb/compression_type.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// A read Blob request structure for use in BlobSource::MultiGetBlob and +// BlobFileReader::MultiGetBlob. +struct BlobReadRequest { + // User key to lookup the paired blob + const Slice* user_key = nullptr; + + // File offset in bytes + uint64_t offset = 0; + + // Length to read in bytes + size_t len = 0; + + // Blob compression type + CompressionType compression = kNoCompression; + + // Output parameter set by MultiGetBlob() to point to the data buffer, and + // the number of valid bytes + PinnableSlice* result = nullptr; + + // Status of read + Status* status = nullptr; + + BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len, + CompressionType _compression, PinnableSlice* _result, + Status* _status) + : user_key(&_user_key), + offset(_offset), + len(_len), + compression(_compression), + result(_result), + status(_status) {} + + BlobReadRequest() = default; + BlobReadRequest(const BlobReadRequest& other) = default; + BlobReadRequest& operator=(const BlobReadRequest& other) = default; +}; + +using BlobFileReadRequests = + std::tuple<uint64_t /* file_number */, uint64_t /* file_size */, + autovector<BlobReadRequest>>; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc new file mode 100644 index 000000000..bfade2507 --- /dev/null +++ b/src/rocksdb/db/blob/blob_source.cc @@ -0,0 +1,488 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_source.h" + +#include <cassert> +#include <string> + +#include "cache/cache_reservation_manager.h" +#include "cache/charged_cache.h" +#include "db/blob/blob_contents.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_log_format.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "table/get_context.h" +#include "table/multiget_context.h" + +namespace ROCKSDB_NAMESPACE { + +BlobSource::BlobSource(const ImmutableOptions* immutable_options, + const std::string& db_id, + const std::string& db_session_id, + BlobFileCache* blob_file_cache) + : db_id_(db_id), + db_session_id_(db_session_id), + statistics_(immutable_options->statistics.get()), + blob_file_cache_(blob_file_cache), + blob_cache_(immutable_options->blob_cache), + lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) { +#ifndef ROCKSDB_LITE + auto bbto = + immutable_options->table_factory->GetOptions<BlockBasedTableOptions>(); + if (bbto && + bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache) + .charged == CacheEntryRoleOptions::Decision::kEnabled) { + blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache, + bbto->block_cache); + } +#endif // ROCKSDB_LITE +} + +BlobSource::~BlobSource() = default; + +Status BlobSource::GetBlobFromCache( + const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const { + assert(blob_cache_); + assert(!cache_key.empty()); + assert(cached_blob); + assert(cached_blob->IsEmpty()); + + Cache::Handle* cache_handle = nullptr; + cache_handle = GetEntryFromCache(cache_key); + if (cache_handle != nullptr) { + *cached_blob = + CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle); + + assert(cached_blob->GetValue()); + + 
PERF_COUNTER_ADD(blob_cache_hit_count, 1); + RecordTick(statistics_, BLOB_DB_CACHE_HIT); + RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ, + cached_blob->GetValue()->size()); + + return Status::OK(); + } + + RecordTick(statistics_, BLOB_DB_CACHE_MISS); + + return Status::NotFound("Blob not found in cache"); +} + +Status BlobSource::PutBlobIntoCache( + const Slice& cache_key, std::unique_ptr<BlobContents>* blob, + CacheHandleGuard<BlobContents>* cached_blob) const { + assert(blob_cache_); + assert(!cache_key.empty()); + assert(blob); + assert(*blob); + assert(cached_blob); + assert(cached_blob->IsEmpty()); + + Cache::Handle* cache_handle = nullptr; + const Status s = InsertEntryIntoCache(cache_key, blob->get(), + (*blob)->ApproximateMemoryUsage(), + &cache_handle, Cache::Priority::BOTTOM); + if (s.ok()) { + blob->release(); + + assert(cache_handle != nullptr); + *cached_blob = + CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle); + + assert(cached_blob->GetValue()); + + RecordTick(statistics_, BLOB_DB_CACHE_ADD); + RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE, + cached_blob->GetValue()->size()); + + } else { + RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES); + } + + return s; +} + +Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const { + Cache::Handle* cache_handle = nullptr; + + if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) { + Cache::CreateCallback create_cb = + [allocator = blob_cache_->memory_allocator()]( + const void* buf, size_t size, void** out_obj, + size_t* charge) -> Status { + return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf, + size, out_obj, charge); + }; + + cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(), + create_cb, Cache::Priority::BOTTOM, + true /* wait_for_cache */, statistics_); + } else { + cache_handle = blob_cache_->Lookup(key, statistics_); + } + + return cache_handle; +} + +void 
BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob, + PinnableSlice* value) { + assert(cached_blob); + assert(cached_blob->GetValue()); + assert(value); + + // To avoid copying the cached blob into the buffer provided by the + // application, we can simply transfer ownership of the cache handle to + // the target PinnableSlice. This has the potential to save a lot of + // CPU, especially with large blob values. + + value->Reset(); + + constexpr Cleanable* cleanable = nullptr; + value->PinSlice(cached_blob->GetValue()->data(), cleanable); + + cached_blob->TransferTo(value); +} + +void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob, + PinnableSlice* value) { + assert(owned_blob); + assert(*owned_blob); + assert(value); + + BlobContents* const blob = owned_blob->release(); + assert(blob); + + value->Reset(); + value->PinSlice( + blob->data(), + [](void* arg1, void* /* arg2 */) { + delete static_cast<BlobContents*>(arg1); + }, + blob, nullptr); +} + +Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value, + size_t charge, + Cache::Handle** cache_handle, + Cache::Priority priority) const { + Status s; + + Cache::CacheItemHelper* const cache_item_helper = + BlobContents::GetCacheItemHelper(); + assert(cache_item_helper); + + if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) { + s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle, + priority); + } else { + s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb, + cache_handle, priority); + } + + return s; +} + +Status BlobSource::GetBlob(const ReadOptions& read_options, + const Slice& user_key, uint64_t file_number, + uint64_t offset, uint64_t file_size, + uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) { + assert(value); + + Status s; + + const CacheKey cache_key = GetCacheKey(file_number, file_size, offset); + + 
CacheHandleGuard<BlobContents> blob_handle; + + // First, try to get the blob from the cache + // + // If blob cache is enabled, we'll try to read from it. + if (blob_cache_) { + Slice key = cache_key.AsSlice(); + s = GetBlobFromCache(key, &blob_handle); + if (s.ok()) { + PinCachedBlob(&blob_handle, value); + + // For consistency, the size of on-disk (possibly compressed) blob record + // is assigned to bytes_read. + uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader( + user_key.size()) + : 0; + assert(offset >= adjustment); + + uint64_t record_size = value_size + adjustment; + if (bytes_read) { + *bytes_read = record_size; + } + return s; + } + } + + assert(blob_handle.IsEmpty()); + + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (no_io) { + s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + return s; + } + + // Can't find the blob from the cache. Since I/O is allowed, read from the + // file. + std::unique_ptr<BlobContents> blob_contents; + + { + CacheHandleGuard<BlobFileReader> blob_file_reader; + s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + if (!s.ok()) { + return s; + } + + assert(blob_file_reader.GetValue()); + + if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache) + ? blob_cache_->memory_allocator() + : nullptr; + + uint64_t read_size = 0; + s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, offset, value_size, compression_type, + prefetch_buffer, allocator, &blob_contents, &read_size); + if (!s.ok()) { + return s; + } + if (bytes_read) { + *bytes_read = read_size; + } + } + + if (blob_cache_ && read_options.fill_cache) { + // If filling cache is allowed and a cache is configured, try to put the + // blob to the cache. 
+ Slice key = cache_key.AsSlice(); + s = PutBlobIntoCache(key, &blob_contents, &blob_handle); + if (!s.ok()) { + return s; + } + + PinCachedBlob(&blob_handle, value); + } else { + PinOwnedBlob(&blob_contents, value); + } + + assert(s.ok()); + return s; +} + +void BlobSource::MultiGetBlob(const ReadOptions& read_options, + autovector<BlobFileReadRequests>& blob_reqs, + uint64_t* bytes_read) { + assert(blob_reqs.size() > 0); + + uint64_t total_bytes_read = 0; + uint64_t bytes_read_in_file = 0; + + for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) { + // sort blob_reqs_in_file by file offset. + std::sort( + blob_reqs_in_file.begin(), blob_reqs_in_file.end(), + [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool { + return lhs.offset < rhs.offset; + }); + + MultiGetBlobFromOneFile(read_options, file_number, file_size, + blob_reqs_in_file, &bytes_read_in_file); + + total_bytes_read += bytes_read_in_file; + } + + if (bytes_read) { + *bytes_read = total_bytes_read; + } +} + +void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, + uint64_t file_number, + uint64_t /*file_size*/, + autovector<BlobReadRequest>& blob_reqs, + uint64_t* bytes_read) { + const size_t num_blobs = blob_reqs.size(); + assert(num_blobs > 0); + assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE); + +#ifndef NDEBUG + for (size_t i = 0; i < num_blobs - 1; ++i) { + assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset); + } +#endif // !NDEBUG + + using Mask = uint64_t; + Mask cache_hit_mask = 0; + + uint64_t total_bytes = 0; + const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + + if (blob_cache_) { + size_t cached_blob_count = 0; + for (size_t i = 0; i < num_blobs; ++i) { + auto& req = blob_reqs[i]; + + CacheHandleGuard<BlobContents> blob_handle; + const CacheKey cache_key = base_cache_key.WithOffset(req.offset); + const Slice key = cache_key.AsSlice(); + + const Status s = GetBlobFromCache(key, &blob_handle); + + if 
(s.ok()) { + assert(req.status); + *req.status = s; + + PinCachedBlob(&blob_handle, req.result); + + // Update the counter for the number of valid blobs read from the cache. + ++cached_blob_count; + + // For consistency, the size of each on-disk (possibly compressed) blob + // record is accumulated to total_bytes. + uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader( + req.user_key->size()) + : 0; + assert(req.offset >= adjustment); + total_bytes += req.len + adjustment; + cache_hit_mask |= (Mask{1} << i); // cache hit + } + } + + // All blobs were read from the cache. + if (cached_blob_count == num_blobs) { + if (bytes_read) { + *bytes_read = total_bytes; + } + return; + } + } + + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (no_io) { + for (size_t i = 0; i < num_blobs; ++i) { + if (!(cache_hit_mask & (Mask{1} << i))) { + BlobReadRequest& req = blob_reqs[i]; + assert(req.status); + + *req.status = + Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + } + } + return; + } + + { + // Find the rest of blobs from the file since I/O is allowed. + autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>> + _blob_reqs; + uint64_t _bytes_read = 0; + + for (size_t i = 0; i < num_blobs; ++i) { + if (!(cache_hit_mask & (Mask{1} << i))) { + _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>()); + } + } + + CacheHandleGuard<BlobFileReader> blob_file_reader; + Status s = + blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + if (!s.ok()) { + for (size_t i = 0; i < _blob_reqs.size(); ++i) { + BlobReadRequest* const req = _blob_reqs[i].first; + assert(req); + assert(req->status); + + *req->status = s; + } + return; + } + + assert(blob_file_reader.GetValue()); + + MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache) + ? 
blob_cache_->memory_allocator() + : nullptr; + + blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator, + _blob_reqs, &_bytes_read); + + if (blob_cache_ && read_options.fill_cache) { + // If filling cache is allowed and a cache is configured, try to put + // the blob(s) to the cache. + for (auto& [req, blob_contents] : _blob_reqs) { + assert(req); + + if (req->status->ok()) { + CacheHandleGuard<BlobContents> blob_handle; + const CacheKey cache_key = base_cache_key.WithOffset(req->offset); + const Slice key = cache_key.AsSlice(); + s = PutBlobIntoCache(key, &blob_contents, &blob_handle); + if (!s.ok()) { + *req->status = s; + } else { + PinCachedBlob(&blob_handle, req->result); + } + } + } + } else { + for (auto& [req, blob_contents] : _blob_reqs) { + assert(req); + + if (req->status->ok()) { + PinOwnedBlob(&blob_contents, req->result); + } + } + } + + total_bytes += _bytes_read; + if (bytes_read) { + *bytes_read = total_bytes; + } + } +} + +bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size, + uint64_t offset, size_t* charge) const { + const CacheKey cache_key = GetCacheKey(file_number, file_size, offset); + const Slice key = cache_key.AsSlice(); + + CacheHandleGuard<BlobContents> blob_handle; + const Status s = GetBlobFromCache(key, &blob_handle); + + if (s.ok() && blob_handle.GetValue() != nullptr) { + if (charge) { + const Cache* const cache = blob_handle.GetCache(); + assert(cache); + + Cache::Handle* const handle = blob_handle.GetCacheHandle(); + assert(handle); + + *charge = cache->GetUsage(handle); + } + + return true; + } + + return false; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h new file mode 100644 index 000000000..2ed296eeb --- /dev/null +++ b/src/rocksdb/db/blob/blob_source.h @@ -0,0 +1,153 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cinttypes> +#include <memory> + +#include "cache/cache_helpers.h" +#include "cache/cache_key.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_read_request.h" +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" +#include "table/block_based/cachable_entry.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +struct ImmutableOptions; +class Status; +class FilePrefetchBuffer; +class Slice; +class BlobContents; + +// BlobSource is a class that provides universal access to blobs, regardless of +// whether they are in the blob cache, secondary cache, or (remote) storage. +// Depending on user settings, it always fetch blobs from multi-tier cache and +// storage with minimal cost. +class BlobSource { + public: + BlobSource(const ImmutableOptions* immutable_options, + const std::string& db_id, const std::string& db_session_id, + BlobFileCache* blob_file_cache); + + BlobSource(const BlobSource&) = delete; + BlobSource& operator=(const BlobSource&) = delete; + + ~BlobSource(); + + // Read a blob from the underlying cache or one blob file. + // + // If successful, returns ok and sets "*value" to the newly retrieved + // uncompressed blob. If there was an error while fetching the blob, sets + // "*value" to empty and returns a non-ok status. + // + // Note: For consistency, whether the blob is found in the cache or on disk, + // sets "*bytes_read" to the size of on-disk (possibly compressed) blob + // record. 
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t file_number, uint64_t offset, uint64_t file_size, + uint64_t value_size, CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read); + + // Read multiple blobs from the underlying cache or blob file(s). + // + // If successful, returns ok and sets "result" in the elements of "blob_reqs" + // to the newly retrieved uncompressed blobs. If there was an error while + // fetching one of blobs, sets its "result" to empty and sets its + // corresponding "status" to a non-ok status. + // + // Note: + // - The main difference between this function and MultiGetBlobFromOneFile is + // that this function can read multiple blobs from multiple blob files. + // + // - For consistency, whether the blob is found in the cache or on disk, sets + // "*bytes_read" to the total size of on-disk (possibly compressed) blob + // records. + void MultiGetBlob(const ReadOptions& read_options, + autovector<BlobFileReadRequests>& blob_reqs, + uint64_t* bytes_read); + + // Read multiple blobs from the underlying cache or one blob file. + // + // If successful, returns ok and sets "result" in the elements of "blob_reqs" + // to the newly retrieved uncompressed blobs. If there was an error while + // fetching one of blobs, sets its "result" to empty and sets its + // corresponding "status" to a non-ok status. + // + // Note: + // - The main difference between this function and MultiGetBlob is that this + // function is only used for the case where the demanded blobs are stored in + // one blob file. MultiGetBlob will call this function multiple times if the + // demanded blobs are stored in multiple blob files. + // + // - For consistency, whether the blob is found in the cache or on disk, sets + // "*bytes_read" to the total size of on-disk (possibly compressed) blob + // records. 
+ void MultiGetBlobFromOneFile(const ReadOptions& read_options, + uint64_t file_number, uint64_t file_size, + autovector<BlobReadRequest>& blob_reqs, + uint64_t* bytes_read); + + inline Status GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard<BlobFileReader>* blob_file_reader) { + return blob_file_cache_->GetBlobFileReader(blob_file_number, + blob_file_reader); + } + + inline Cache* GetBlobCache() const { return blob_cache_.get(); } + + bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size, + uint64_t offset, size_t* charge = nullptr) const; + + private: + Status GetBlobFromCache(const Slice& cache_key, + CacheHandleGuard<BlobContents>* cached_blob) const; + + Status PutBlobIntoCache(const Slice& cache_key, + std::unique_ptr<BlobContents>* blob, + CacheHandleGuard<BlobContents>* cached_blob) const; + + static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob, + PinnableSlice* value); + + static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob, + PinnableSlice* value); + + Cache::Handle* GetEntryFromCache(const Slice& key) const; + + Status InsertEntryIntoCache(const Slice& key, BlobContents* value, + size_t charge, Cache::Handle** cache_handle, + Cache::Priority priority) const; + + inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/, + uint64_t offset) const { + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + return base_cache_key.WithOffset(offset); + } + + const std::string& db_id_; + const std::string& db_session_id_; + + Statistics* statistics_; + + // A cache to store blob file reader. + BlobFileCache* blob_file_cache_; + + // A cache to store uncompressed blobs. + std::shared_ptr<Cache> blob_cache_; + + // The control option of how the cache tiers will be used. 
Currently rocksdb + // support block/blob cache (volatile tier) and secondary cache (this tier + // isn't strictly speaking a non-volatile tier since the compressed cache in + // this tier is in volatile memory). + const CacheTier lowest_used_cache_tier_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc new file mode 100644 index 000000000..a85ed8646 --- /dev/null +++ b/src/rocksdb/db/blob/blob_source_test.cc @@ -0,0 +1,1624 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_source.h" + +#include <cassert> +#include <cstdint> +#include <cstdio> +#include <memory> +#include <string> + +#include "cache/charged_cache.h" +#include "cache/compressed_secondary_cache.h" +#include "db/blob/blob_contents.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/db_test_util.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "options/cf_options.h" +#include "rocksdb/options.h" +#include "util/compression.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with `num` blobs in it. 
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector<Slice>& keys, + const std::vector<Slice>& blobs, CompressionType compression, + std::vector<uint64_t>& blob_offsets, + std::vector<uint64_t>& blob_sizes) { + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr<FSWritableFile> file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + std::vector<std::string> compressed_blobs(num); + std::vector<Slice> blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { + 
ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] = compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +// Test fixture built on DBTestBase: enables blob files, installs an 8 MiB LRU +// blob cache restricted to the volatile tier, and records the DB id and +// session id that the tests pass to BlobSource. +class BlobSourceTest : public DBTestBase { + public: + explicit BlobSourceTest() + : DBTestBase("blob_source_test", /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + LRUCacheOptions co; + co.capacity = 8 << 20; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + co.high_pri_pool_ratio = 0.2; + co.low_pri_pool_ratio = 0.2; + options_.blob_cache = NewLRUCache(co); + options_.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Do not wrap these calls in assert(): with NDEBUG defined the whole + // expression is compiled out and db_id_ / db_session_id_ would stay empty. + // EXPECT_OK always evaluates its argument and, unlike ASSERT_OK, may be + // used inside a constructor. + EXPECT_OK(db_->GetDbIdentity(db_id_)); + EXPECT_OK(db_->GetDbSessionId(db_session_id_)); + } + + Options options_; + std::string db_id_; + std::string db_session_id_; +}; + +TEST_F(BlobSourceTest, GetBlobsFromCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 16; + + std::vector<std::string> key_strs; + 
std::vector<std::string> blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector<Slice> keys; + std::vector<Slice> blobs; + + uint64_t file_size = BlobLogHeader::kSize; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 1024; + std::shared_ptr<Cache> backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + // GetBlob + std::vector<PinnableSlice> values(keys.size()); + uint64_t bytes_read = 0; + uint64_t blob_bytes = 0; + uint64_t total_bytes = 0; + + read_options.fill_cache = false; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + 
ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + + read_options.fill_cache = true; + blob_bytes = 0; + total_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + blob_bytes += blob_sizes[i]; + total_bytes += bytes_read; + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i); + ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + + 
ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + blob_bytes); + + read_options.fill_cache = true; + total_bytes = 0; + blob_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; // on-disk blob record size + blob_bytes += blob_sizes[i]; // cached blob value size + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + + // Cache-only GetBlob + read_options.read_tier = ReadTier::kBlockCacheTier; + total_bytes = 0; + blob_bytes = 0; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read)); + ASSERT_EQ(values[i], blobs[i]); + ASSERT_TRUE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + total_bytes += bytes_read; + blob_bytes += blob_sizes[i]; + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + options_.blob_cache->EraseUnRefEntries(); + + { + // Cache-only GetBlob + std::vector<PinnableSlice> values(keys.size()); + uint64_t bytes_read = 0; + + read_options.read_tier = ReadTier::kBlockCacheTier; + read_options.fill_cache = true; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_TRUE(blob_source + .GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read) + .IsIncomplete()); + ASSERT_TRUE(values[i].empty()); + ASSERT_FALSE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, 0); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + { + // GetBlob from non-existing file + std::vector<PinnableSlice> values(keys.size()); + uint64_t bytes_read = 0; + uint64_t file_number = 100; // non-existing file + + read_options.read_tier = ReadTier::kReadAllTier; + read_options.fill_cache = true; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + + ASSERT_TRUE(blob_source + .GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, &values[i], + &bytes_read) + .IsIOError()); + ASSERT_TRUE(values[i].empty()); + ASSERT_FALSE(values[i].IsPinned()); + ASSERT_EQ(bytes_read, 0); + + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache, + // GetBlob, and TEST_BlobInCache. 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +TEST_F(BlobSourceTest, GetCompressedBlobs) { + if (!Snappy_Supported()) { + return; + } + + const CompressionType compression = kSnappyCompression; + + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr size_t num_blobs = 256; + + std::vector<std::string> key_strs; + std::vector<std::string> blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector<Slice> keys; + std::vector<Slice> blobs; + + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + } + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + constexpr size_t capacity = 1024; + auto backing_cache = NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions 
read_options; + read_options.verify_checksums = true; + + uint64_t bytes_read = 0; + std::vector<PinnableSlice> values(keys.size()); + + { + // Snappy Compression + const uint64_t file_number = 1; + + read_options.read_tier = ReadTier::kReadAllTier; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range, expiration_range, file_number, keys, blobs, + compression, blob_offsets, blob_sizes); + + CacheHandleGuard<BlobFileReader> blob_file_reader; + ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader)); + ASSERT_NE(blob_file_reader.GetValue(), nullptr); + + const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); + ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_NE(blobs[i].size() /*uncompressed size*/, + blob_sizes[i] /*compressed size*/); + } + + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + compression, nullptr /*prefetch_buffer*/, + &values[i], &bytes_read)); + ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/); + ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0); + + read_options.read_tier = ReadTier::kBlockCacheTier; + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + + // Compressed blob size is passed in GetBlob + 
ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number, + blob_offsets[i], file_size, blob_sizes[i], + compression, nullptr /*prefetch_buffer*/, + &values[i], &bytes_read)); + ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/); + ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + } +} + +TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"), + 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_files = 2; + constexpr size_t num_blobs = 32; + + std::vector<std::string> key_strs; + std::vector<std::string> blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector<Slice> keys; + std::vector<Slice> blobs; + + uint64_t file_size = BlobLogHeader::kSize; + uint64_t blob_value_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + blob_value_bytes += blobs[i].size(); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + const uint64_t blob_records_bytes = + file_size - BlobLogHeader::kSize - BlobLogFooter::kSize; + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + { + // Write key/blob pairs 
to multiple blob files. + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range, expiration_range, file_number, keys, + blobs, kNoCompression, blob_offsets, blob_sizes); + } + } + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + uint64_t bytes_read = 0; + + { + // MultiGetBlob + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + + autovector<BlobFileReadRequests> blob_reqs; + std::array<autovector<BlobReadRequest>, blob_files> blob_reqs_in_file; + std::array<PinnableSlice, num_blobs * blob_files> value_buf; + std::array<Status, num_blobs * blob_files> statuses_buf; + + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + blob_reqs_in_file[i].emplace_back( + keys[j], blob_offsets[j], blob_sizes[j], kNoCompression, + &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]); + } + blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read); + + for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + ASSERT_OK(statuses_buf[i * num_blobs + j]); + ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]); + 
ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[j])); + } + } + + // Retrieved all blobs from 2 blob files twice via MultiGetBlob and + // TEST_BlobInCache. + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, + num_blobs * blob_files); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + num_blobs * blob_files); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + blob_records_bytes * blob_files); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), + num_blobs * blob_files); // MultiGetBlob + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), + num_blobs * blob_files); // TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), + num_blobs * blob_files); // MultiGetBlob + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_value_bytes * blob_files); // TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + blob_value_bytes * blob_files); // MultiGetBlob + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + autovector<BlobReadRequest> fake_blob_reqs_in_file; + std::array<PinnableSlice, num_blobs> fake_value_buf; + std::array<Status, num_blobs> fake_statuses_buf; + + const uint64_t fake_file_number = 100; + for (size_t i = 0; i < num_blobs; ++i) { + fake_blob_reqs_in_file.emplace_back( + keys[i], blob_offsets[i], blob_sizes[i], kNoCompression, + &fake_value_buf[i], &fake_statuses_buf[i]); + } + + // Add a fake multi-get blob request. + blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file); + + blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read); + + // Check the real blob read requests. 
+ for (size_t i = 0; i < blob_files; ++i) { + const uint64_t file_number = i + 1; + for (size_t j = 0; j < num_blobs; ++j) { + ASSERT_OK(statuses_buf[i * num_blobs + j]); + ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]); + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[j])); + } + } + + // Check the fake blob request. + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(fake_statuses_buf[i].IsIOError()); + ASSERT_TRUE(fake_value_buf[i].empty()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size, + blob_offsets[i])); + } + + // Retrieved all blobs from 3 blob files (including the fake one) twice + // via MultiGetBlob and TEST_BlobInCache. + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, + num_blobs * blob_files * 2); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + 0); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + // Fake blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + // Real blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), + num_blobs * blob_files * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + // Real blob requests: MultiGetBlob and TEST_BlobInCache + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_value_bytes * blob_files * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +TEST_F(BlobSourceTest, MultiGetBlobsFromCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + 
ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 16; + + std::vector<std::string> key_strs; + std::vector<std::string> blob_strs; + + for (size_t i = 0; i < num_blobs; ++i) { + key_strs.push_back("key" + std::to_string(i)); + blob_strs.push_back("blob" + std::to_string(i)); + } + + std::vector<Slice> keys; + std::vector<Slice> blobs; + + uint64_t file_size = BlobLogHeader::kSize; + for (size_t i = 0; i < num_blobs; ++i) { + keys.push_back({key_strs[i]}); + blobs.push_back({blob_strs[i]}); + file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size(); + } + file_size += BlobLogFooter::kSize; + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = + NewLRUCache(capacity); // Blob file cache + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ReadOptions read_options; + read_options.verify_checksums = true; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + // MultiGetBlobFromOneFile + uint64_t bytes_read = 0; + std::array<Status, num_blobs> statuses_buf; + std::array<PinnableSlice, num_blobs> value_buf; + autovector<BlobReadRequest> blob_reqs; + + for (size_t i = 0; i < num_blobs; i += 2) { // even index + 
blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + read_options.fill_cache = true; + read_options.read_tier = ReadTier::kReadAllTier; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + // Get half of blobs + blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + uint64_t fs_read_bytes = 0; + uint64_t ca_read_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + if (i % 2 == 0) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + fs_read_bytes += + blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize; + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + ca_read_bytes += blob_sizes[i]; + } else { + statuses_buf[i].PermitUncheckedError(); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + } + + constexpr int num_even_blobs = num_blobs / 2; + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs); + ASSERT_EQ((int)get_perf_context()->blob_read_count, + num_even_blobs); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, + fs_read_bytes); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + ca_read_bytes); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), + 
ca_read_bytes); + + // Get the rest of blobs + for (size_t i = 1; i < num_blobs; i += 2) { // odd index + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + + ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number, + blob_offsets[i], file_size, blob_sizes[i], + kNoCompression, prefetch_buffer, + &value_buf[i], &bytes_read)); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + ASSERT_EQ(bytes_read, + BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + // Cache-only MultiGetBlobFromOneFile + read_options.read_tier = ReadTier::kBlockCacheTier; + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_reqs.clear(); + for (size_t i = 0; i < num_blobs; ++i) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + } + + blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + uint64_t blob_bytes = 0; + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + ASSERT_TRUE(value_buf[i].IsPinned()); + ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + blob_bytes += blob_sizes[i]; + } + + // Retrieved the blob cache num_blobs * 2 times via + // MultiGetBlobFromOneFile and TEST_BlobInCache (the counters were reset + // after the GetBlob loop above, so those calls are not included). 
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o + ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), + blob_bytes * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + options_.blob_cache->EraseUnRefEntries(); + + { + // Cache-only MultiGetBlobFromOneFile + uint64_t bytes_read = 0; + read_options.read_tier = ReadTier::kBlockCacheTier; + + std::array<Status, num_blobs> statuses_buf; + std::array<PinnableSlice, num_blobs> value_buf; + autovector<BlobReadRequest> blob_reqs; + + for (size_t i = 0; i < num_blobs; i++) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number, + file_size, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(statuses_buf[i].IsIncomplete()); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size, + blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o + 
ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } + + { + // MultiGetBlobFromOneFile from non-existing file + uint64_t bytes_read = 0; + uint64_t non_existing_file_number = 100; + read_options.read_tier = ReadTier::kReadAllTier; + + std::array<Status, num_blobs> statuses_buf; + std::array<PinnableSlice, num_blobs> value_buf; + autovector<BlobReadRequest> blob_reqs; + + for (size_t i = 0; i < num_blobs; i++) { + blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i], + kNoCompression, &value_buf[i], &statuses_buf[i]); + ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number, + file_size, blob_offsets[i])); + } + + get_perf_context()->Reset(); + statistics->Reset().PermitUncheckedError(); + + blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number, + file_size, blob_reqs, &bytes_read); + + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_TRUE(statuses_buf[i].IsIOError()); + ASSERT_TRUE(value_buf[i].empty()); + ASSERT_FALSE(value_buf[i].IsPinned()); + ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number, + file_size, blob_offsets[i])); + } + + ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0); + ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o + ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0); + ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0); + + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2); + 
ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0); + ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0); + } +} + +// Fixture for the blob secondary cache tests. The constructor only fills in +// the option structs (primary LRU cache and compressed secondary cache); the +// test body creates the caches from lru_cache_opts_ / secondary_cache_opts_. +class BlobSecondaryCacheTest : public DBTestBase { + public: + explicit BlobSecondaryCacheTest() + : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + // Set a small cache capacity to evict entries from the cache, and to test + // that secondary cache is used properly. + lru_cache_opts_.capacity = 1024; + lru_cache_opts_.num_shard_bits = 0; + lru_cache_opts_.strict_capacity_limit = true; + lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata; + lru_cache_opts_.high_pri_pool_ratio = 0.2; + lru_cache_opts_.low_pri_pool_ratio = 0.2; + + secondary_cache_opts_.capacity = 8 << 20; // 8 MB + secondary_cache_opts_.num_shard_bits = 0; + secondary_cache_opts_.metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + + // Read blobs from the secondary cache if they are not in the primary cache + options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + + // Call the getters outside of assert(): under NDEBUG the assert() argument + // is not evaluated, which would leave db_id_ and db_session_id_ empty. + const bool got_db_id = db_->GetDbIdentity(db_id_).ok(); + assert(got_db_id); + (void)got_db_id; + + const bool got_session_id = db_->GetDbSessionId(db_session_id_).ok(); + assert(got_session_id); + (void)got_session_id; + } + + Options options_; + + LRUCacheOptions lru_cache_opts_; + CompressedSecondaryCacheOptions secondary_cache_opts_; + + std::string db_id_; + std::string db_session_id_; +}; + +TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { + if (!Snappy_Supported()) { + return; + } + + secondary_cache_opts_.compression_type = kSnappyCompression; + lru_cache_opts_.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts_); + options_.blob_cache = NewLRUCache(lru_cache_opts_); + + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_,
"BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"), + 0); + + options_.statistics = CreateDBStatistics(); + Statistics* statistics = options_.statistics.get(); + assert(statistics); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t file_number = 1; + + Random rnd(301); + + std::vector<std::string> key_strs{"key0", "key1"}; + std::vector<std::string> blob_strs{rnd.RandomString(512), + rnd.RandomString(768)}; + + std::vector<Slice> keys{key_strs[0], key_strs[1]}; + std::vector<Slice> blobs{blob_strs[0], blob_strs[1]}; + + std::vector<uint64_t> blob_offsets(keys.size()); + std::vector<uint64_t> blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr size_t capacity = 1024; + std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache(new BlobFileCache( + backing_cache.get(), &immutable_options, &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/)); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + CacheHandleGuard<BlobFileReader> file_reader; + ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader)); + ASSERT_NE(file_reader.GetValue(), nullptr); + const uint64_t file_size = file_reader.GetValue()->GetFileSize(); + ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression); + + ReadOptions read_options; + read_options.verify_checksums = true; + + auto blob_cache = options_.blob_cache; + auto secondary_cache = lru_cache_opts_.secondary_cache; + + Cache::CreateCallback create_cb = [](const void* buf, 
size_t size, + void** out_obj, + size_t* charge) -> Status { + CacheAllocationPtr allocation(new char[size]); + + return BlobContents::CreateCallback(std::move(allocation), buf, size, + out_obj, charge); + }; + + { + // GetBlob + std::vector<PinnableSlice> values(keys.size()); + + read_options.fill_cache = true; + get_perf_context()->Reset(); + + // key0 should be filled to the primary cache from the blob file. + ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, + blob_offsets[0], file_size, blob_sizes[0], + kNoCompression, nullptr /* prefetch_buffer */, + &values[0], nullptr /* bytes_read */)); + // Release cache handle + values[0].Reset(); + + // key0 should be evicted and key0's dummy item is inserted into secondary + // cache. key1 should be filled to the primary cache from the blob file. + ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number, + blob_offsets[1], file_size, blob_sizes[1], + kNoCompression, nullptr /* prefetch_buffer */, + &values[1], nullptr /* bytes_read */)); + + // Release cache handle + values[1].Reset(); + + // key0 should be filled to the primary cache from the blob file. key1 + // should be evicted and key1's dummy item is inserted into secondary cache. + ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number, + blob_offsets[0], file_size, blob_sizes[0], + kNoCompression, nullptr /* prefetch_buffer */, + &values[0], nullptr /* bytes_read */)); + ASSERT_EQ(values[0], blobs[0]); + ASSERT_TRUE( + blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0])); + + // Release cache handle + values[0].Reset(); + + // key0 should be evicted and is inserted into secondary cache. + // key1 should be filled to the primary cache from the blob file. 
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number, + blob_offsets[1], file_size, blob_sizes[1], + kNoCompression, nullptr /* prefetch_buffer */, + &values[1], nullptr /* bytes_read */)); + ASSERT_EQ(values[1], blobs[1]); + ASSERT_TRUE( + blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1])); + + // Release cache handle + values[1].Reset(); + + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number); + + // blob_cache here only looks at the primary cache since we didn't provide + // the cache item helper for the secondary cache. However, since key0 is + // demoted to the secondary cache, we shouldn't be able to find it in the + // primary cache. + { + CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]); + const Slice key0 = cache_key.AsSlice(); + auto handle0 = blob_cache->Lookup(key0, statistics); + ASSERT_EQ(handle0, nullptr); + + // key0's item should be in the secondary cache. + bool is_in_sec_cache = false; + auto sec_handle0 = + secondary_cache->Lookup(key0, create_cb, true, + /*advise_erase=*/true, is_in_sec_cache); + ASSERT_FALSE(is_in_sec_cache); + ASSERT_NE(sec_handle0, nullptr); + ASSERT_TRUE(sec_handle0->IsReady()); + auto value = static_cast<BlobContents*>(sec_handle0->Value()); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[0]); + delete value; + + // key0 doesn't exist in the blob cache although key0's dummy + // item exist in the secondary cache. + ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[0])); + } + + // key1 should exists in the primary cache. key1's dummy item exists + // in the secondary cache. 
+ { + CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]); + const Slice key1 = cache_key.AsSlice(); + auto handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_NE(handle1, nullptr); + blob_cache->Release(handle1); + + bool is_in_sec_cache = false; + auto sec_handle1 = + secondary_cache->Lookup(key1, create_cb, true, + /*advise_erase=*/true, is_in_sec_cache); + ASSERT_FALSE(is_in_sec_cache); + ASSERT_EQ(sec_handle1, nullptr); + + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[1])); + } + + { + // fetch key0 from the blob file to the primary cache. + // key1 is evicted and inserted into the secondary cache. + ASSERT_OK(blob_source.GetBlob( + read_options, keys[0], file_number, blob_offsets[0], file_size, + blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */, + &values[0], nullptr /* bytes_read */)); + ASSERT_EQ(values[0], blobs[0]); + + // Release cache handle + values[0].Reset(); + + // key0 should be in the primary cache. + CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]); + const Slice key0 = cache_key0.AsSlice(); + auto handle0 = blob_cache->Lookup(key0, statistics); + ASSERT_NE(handle0, nullptr); + auto value = static_cast<BlobContents*>(blob_cache->Value(handle0)); + ASSERT_NE(value, nullptr); + ASSERT_EQ(value->data(), blobs[0]); + blob_cache->Release(handle0); + + // key1 is not in the primary cache and is in the secondary cache. + CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]); + const Slice key1 = cache_key1.AsSlice(); + auto handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_EQ(handle1, nullptr); + + // erase key0 from the primary cache. + blob_cache->Erase(key0); + handle0 = blob_cache->Lookup(key0, statistics); + ASSERT_EQ(handle0, nullptr); + + // key1 promotion should succeed due to the primary cache being empty. 
we + // did't call secondary cache's Lookup() here, because it will remove the + // key but it won't be able to promote the key to the primary cache. + // Instead we use the end-to-end blob source API to read key1. + // In function TEST_BlobInCache, key1's dummy item is inserted into the + // primary cache and a standalone handle is checked by GetValue(). + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[1])); + + // key1's dummy handle is in the primary cache and key1's item is still + // in the secondary cache. So, the primary cache's Lookup() without + // secondary cache support cannot see it. (NOTE: The dummy handle used + // to be a leaky abstraction but not anymore.) + handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_EQ(handle1, nullptr); + + // But after another access, it is promoted to primary cache + ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, + blob_offsets[1])); + + // And Lookup() can find it (without secondary cache support) + handle1 = blob_cache->Lookup(key1, statistics); + ASSERT_NE(handle1, nullptr); + ASSERT_NE(blob_cache->Value(handle1), nullptr); + blob_cache->Release(handle1); + } + } +} + +class BlobSourceCacheReservationTest : public DBTestBase { + public: + explicit BlobSourceCacheReservationTest() + : DBTestBase("blob_source_cache_reservation_test", + /*env_do_fsync=*/true) { + options_.env = env_; + options_.enable_blob_files = true; + options_.create_if_missing = true; + + LRUCacheOptions co; + co.capacity = kCacheCapacity; + co.num_shard_bits = kNumShardBits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + + co.high_pri_pool_ratio = 0.0; + co.low_pri_pool_ratio = 0.0; + std::shared_ptr<Cache> blob_cache = NewLRUCache(co); + + co.high_pri_pool_ratio = 0.5; + co.low_pri_pool_ratio = 0.5; + std::shared_ptr<Cache> block_cache = NewLRUCache(co); + + options_.blob_cache = blob_cache; + options_.lowest_used_cache_tier = CacheTier::kVolatileTier; + + BlockBasedTableOptions 
block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = block_cache; + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kBlobCache, + {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}}); + options_.table_factory.reset( + NewBlockBasedTableFactory(block_based_options)); + + assert(db_->GetDbIdentity(db_id_).ok()); + assert(db_->GetDbSessionId(db_session_id_).ok()); + } + + void GenerateKeysAndBlobs() { + for (size_t i = 0; i < kNumBlobs; ++i) { + key_strs_.push_back("key" + std::to_string(i)); + blob_strs_.push_back("blob" + std::to_string(i)); + } + + blob_file_size_ = BlobLogHeader::kSize; + for (size_t i = 0; i < kNumBlobs; ++i) { + keys_.push_back({key_strs_[i]}); + blobs_.push_back({blob_strs_[i]}); + blob_file_size_ += + BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size(); + } + blob_file_size_ += BlobLogFooter::kSize; + } + + static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl< + CacheEntryRole::kBlobCache>::GetDummyEntrySize(); + static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry; + static constexpr int kNumShardBits = 0; // 2^0 shard + + static constexpr uint32_t kColumnFamilyId = 1; + static constexpr bool kHasTTL = false; + static constexpr uint64_t kBlobFileNumber = 1; + static constexpr size_t kNumBlobs = 16; + + std::vector<Slice> keys_; + std::vector<Slice> blobs_; + std::vector<std::string> key_strs_; + std::vector<std::string> blob_strs_; + uint64_t blob_file_size_; + + Options options_; + std::string db_id_; + std::string db_session_id_; +}; + +#ifndef ROCKSDB_LITE +TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"), + 0); + + GenerateKeysAndBlobs(); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + + constexpr ExpirationRange 
expiration_range; + + std::vector<uint64_t> blob_offsets(keys_.size()); + std::vector<uint64_t> blob_sizes(keys_.size()); + + WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range, + expiration_range, kBlobFileNumber, keys_, blobs_, + kNoCompression, blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ConcurrentCacheReservationManager* cache_res_mgr = + static_cast<ChargedCache*>(blob_source.GetBlobCache()) + ->TEST_GetCacheReservationManager(); + ASSERT_NE(cache_res_mgr, nullptr); + + ReadOptions read_options; + read_options.verify_checksums = true; + + { + read_options.fill_cache = false; + + std::vector<PinnableSlice> values(keys_.size()); + + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0); + } + } + + { + read_options.fill_cache = true; + + std::vector<PinnableSlice> values(keys_.size()); + + // num_blobs is 16, so the total blob cache usage is less than a single + // dummy entry. Therefore, cache reservation manager only reserves one dummy + // entry here. 
+ uint64_t blob_bytes = 0; + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + + size_t charge = 0; + ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_, + blob_offsets[i], &charge)); + + blob_bytes += charge; + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), + options_.blob_cache->GetUsage()); + } + } + + { + OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber); + size_t blob_bytes = options_.blob_cache->GetUsage(); + + for (size_t i = 0; i < kNumBlobs; ++i) { + size_t charge = 0; + ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_, + blob_offsets[i], &charge)); + + CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]); + // We didn't call options_.blob_cache->Erase() here, this is because + // the cache wrapper's Erase() method must be called to update the + // cache usage after erasing the cache entry. + blob_source.GetBlobCache()->Erase(cache_key.AsSlice()); + if (i == kNumBlobs - 1) { + // All the blobs got removed from the cache. cache_res_mgr should not + // reserve any space for them. 
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0); + } else { + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + } + blob_bytes -= charge; + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), + options_.blob_cache->GetUsage()); + } + } +} + +TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { + options_.cf_paths.emplace_back( + test::PerThreadDBPath( + env_, + "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"), + 0); + + GenerateKeysAndBlobs(); + + DestroyAndReopen(options_); + + ImmutableOptions immutable_options(options_); + constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2); + for (size_t i = 0; i < kNumBlobs; ++i) { + blob_file_size_ -= blobs_[i].size(); // old blob size + blob_strs_[i].resize(blob_size, '@'); + blobs_[i] = Slice(blob_strs_[i]); + blob_file_size_ += blobs_[i].size(); // new blob size + } + + std::vector<uint64_t> blob_offsets(keys_.size()); + std::vector<uint64_t> blob_sizes(keys_.size()); + + constexpr ExpirationRange expiration_range; + WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range, + expiration_range, kBlobFileNumber, keys_, blobs_, + kNoCompression, blob_offsets, blob_sizes); + + constexpr size_t capacity = 10; + std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr<BlobFileCache> blob_file_cache = + std::make_unique<BlobFileCache>( + backing_cache.get(), &immutable_options, &file_options, + kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); + + BlobSource blob_source(&immutable_options, db_id_, db_session_id_, + blob_file_cache.get()); + + ConcurrentCacheReservationManager* cache_res_mgr = + static_cast<ChargedCache*>(blob_source.GetBlobCache()) + ->TEST_GetCacheReservationManager(); + ASSERT_NE(cache_res_mgr, nullptr); + + ReadOptions 
read_options; + read_options.verify_checksums = true; + + { + read_options.fill_cache = false; + + std::vector<PinnableSlice> values(keys_.size()); + + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0); + } + } + + { + read_options.fill_cache = true; + + std::vector<PinnableSlice> values(keys_.size()); + + // Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we + // can't fit all the blobs in the cache at the same time, which means we + // should observe cache evictions once we reach the cache's capacity. + // Due to the overhead of the cache and the BlobContents objects, as well as + // jemalloc bin sizes, this happens after inserting seven blobs. + uint64_t blob_bytes = 0; + for (size_t i = 0; i < kNumBlobs; ++i) { + ASSERT_OK(blob_source.GetBlob( + read_options, keys_[i], kBlobFileNumber, blob_offsets[i], + blob_file_size_, blob_sizes[i], kNoCompression, + nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */)); + + // Release cache handle + values[i].Reset(); + + if (i < kNumBlobs / 2 - 1) { + size_t charge = 0; + ASSERT_TRUE(blob_source.TEST_BlobInCache( + kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge)); + + blob_bytes += charge; + } + + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); + ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), + options_.blob_cache->GetUsage()); + } + } +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git 
a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc new file mode 100644 index 000000000..e6832a2ae --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_basic_test.cc @@ -0,0 +1,1789 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include <array> +#include <sstream> +#include <string> + +#include "cache/compressed_secondary_cache.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. 
+ ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, GetBlobFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + read_options.fill_cache = false; + + { + PinnableSlice result; + + read_options.read_tier = kReadAllTier; + ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + + result.Reset(); + read_options.read_tier = kBlockCacheTier; + + // Try again with no I/O allowed. Since we didn't re-fill the cache, the + // blob itself can only be read from the blob file, so the read should + // return Incomplete. + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); + ASSERT_TRUE(result.empty()); + } + + read_options.fill_cache = true; + + { + PinnableSlice result; + + read_options.read_tier = kReadAllTier; + ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + + result.Reset(); + read_options.read_tier = kBlockCacheTier; + + // Try again with no I/O allowed. 
The table and the necessary blocks/blobs + // should already be in their respective caches. + ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, blob_value); + } +} + +TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.statistics = CreateDBStatistics(); + + Reopen(options); + + int num_blobs = 5; + std::vector<std::string> keys; + std::vector<std::string> blobs; + + for (int i = 0; i < num_blobs; ++i) { + keys.push_back("key" + std::to_string(i)); + blobs.push_back("blob" + std::to_string(i)); + ASSERT_OK(Put(keys[i], blobs[i])); + } + ASSERT_OK(Flush()); + + ReadOptions read_options; + + { + read_options.fill_cache = false; + read_options.read_tier = kReadAllTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } + + { + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Try again with no I/O allowed. 
Since we didn't re-fill the cache, + // the blob itself can only be read from the blob file, so iter->Valid() + // should be false. + iter->SeekToFirst(); + ASSERT_NOK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } + + { + read_options.fill_cache = true; + read_options.read_tier = kReadAllTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Read blobs from the file and refill the cache. + int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), + num_blobs); + } + + { + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + // Try again with no I/O allowed. The table and the necessary blocks/blobs + // should already be in their respective caches. + int i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key().ToString(), keys[i]); + ASSERT_EQ(iter->value().ToString(), blobs[i]); + ++i; + } + ASSERT_EQ(i, num_blobs); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); + } +} + +TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + + LRUCacheOptions cache_options; + cache_options.capacity = 2048; + cache_options.num_shard_bits = 0; + cache_options.metadata_charge_policy = kDontChargeCacheMetadata; + + options.blob_cache = NewLRUCache(cache_options); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then iterate over three key-values. 
The second value is below the size + // limit and is thus stored inline; the other two are stored separately as + // blobs. We expect to have something pinned in the cache iff we are + // positioned on a blob. + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "long_value"; + static_assert(sizeof(first_value) - 1 >= min_blob_size, + "first_value too short to be stored as blob"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "short"; + static_assert(sizeof(second_value) - 1 < min_blob_size, + "second_value too long to be inlined"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + { + ReadOptions read_options; + read_options.fill_cache = true; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); 
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } + + { + ReadOptions read_options; + read_options.fill_cache = false; + read_options.read_tier = kBlockCacheTier; + + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_value); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_value); + ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0); + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array<Slice, num_keys> keys{{first_key, second_key, third_key}}; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. 
+ read_options.read_tier = kBlockCacheTier; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + constexpr size_t min_blob_size = 6; + options.min_blob_size = min_blob_size; + options.create_if_missing = true; + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + DestroyAndReopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + read_options.fill_cache = false; + + std::array<Slice, num_keys> keys{{first_key, second_key, third_key}}; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. 
+ read_options.read_tier = kBlockCacheTier; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } + + // Fill the cache when reading blobs from the blob file. + read_options.read_tier = kReadAllTier; + read_options.fill_cache = true; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. All blobs should be successfully read from + // the cache. + read_options.read_tier = kBlockCacheTier; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { + Options options = GetDefaultOptions(); + + // First, create an external SST file ["b"]. 
+ const std::string file_path = dbname_ + "/test.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions()); + Status s = sst_file_writer.Open(file_path); + ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put("b", "b_value")); + ASSERT_OK(sst_file_writer.Finish()); + } + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.use_direct_reads = true; + options.allow_ingest_behind = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut + // new table file when encountering a new key whose 1-byte prefix changes. + constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + ROCKSDB_GTEST_SKIP("This test requires direct IO support"); + return; + } + ASSERT_OK(s); + + constexpr size_t num_keys = 3; + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + // first_blob, second_blob and third_blob in the same blob file. + // SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| + // | | | ^ ^ ^ + // | | | | | | + // | | +---------|-------|--------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + // fourth_blob in another blob file. 
+ // SST Blob file SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'| + // | | | ^ ^ ^ | ^ + // | | | | | | | | + // | | +---------|-------|--------+ +-------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // Due to the above sst partitioner, we get 4 L1 files. The blob files are + // unchanged. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + // Ingest the external SST file into bottommost level. + std::vector<std::string> ext_files{file_path}; + IngestExternalFileOptions opts; + opts.ingest_behind = true; + ASSERT_OK( + db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts)); + } + + // Now the database becomes as follows. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + // + // L6 ["b"] + + { + // Compact ["b"] to bottommost level. 
+ Slice begin = Slice(second_key); + Slice end = Slice(second_key); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin, &end)); + } + + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["c"] | | ["d"] | + // | | | | | + // | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------|-----------------+ + // | + // L6 ["b"] + ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1)); + ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6)); + + bool called = false; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) { + auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg); + assert(aligned_reqs); + ASSERT_EQ(1, aligned_reqs->size()); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array<Slice, num_keys> keys{{first_key, third_key, second_key}}; + + { + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + // The MultiGet(), when constructing the KeyContexts, will process the keys + // in this order: a, d, b. The reason is that ["a"] and ["d"] are in L1, + // while ["b"] resides in L6. + // Consequently, the original FSReadRequest list prepared by + // Version::MultiGetBlob() will be for "a", "d" and "b". It is unsorted as + // follows: + // + // ["a", offset=30, len=3033], + // ["d", offset=9096, len=3033], + // ["b", offset=3063, len=6033] + // + // If we do not sort them before calling MultiRead() in DirectIO, then the + // underlying IO merging logic will yield two requests. + // + // [offset=0, len=4096] (for "a") + // [offset=0, len=12288] (result of merging the request for "d" and "b") + // + // We need to sort them in Version::MultiGetBlob() so that the underlying + // IO merging logic in DirectIO mode works as expected. 
The correct + // behavior will be one aligned request: + // + // [offset=0, len=12288] + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(called); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_blob); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], third_blob); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], second_blob); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 2 << 20; // 2MB + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.min_blob_size = 0; + options.create_if_missing = true; + options.enable_blob_files = true; + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + Reopen(options); + + constexpr size_t kNumBlobFiles = 3; + constexpr size_t kNumBlobsPerFile = 3; + constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles; + + std::vector<std::string> key_strs; + std::vector<std::string> value_strs; + for (size_t i = 0; i < kNumBlobFiles; ++i) { + for (size_t j = 0; j < kNumBlobsPerFile; ++j) { + std::string key = "key" + std::to_string(i) + "_" + std::to_string(j); + std::string value = + "value_as_blob" + std::to_string(i) + "_" + std::to_string(j); + ASSERT_OK(Put(key, value)); + key_strs.push_back(key); + value_strs.push_back(value); + } + ASSERT_OK(Flush()); + } + assert(key_strs.size() == kNumKeys); + std::array<Slice, kNumKeys> keys; + for (size_t i = 0; i < keys.size(); ++i) { + keys[i] 
= key_strs[i]; + } + + ReadOptions read_options; + read_options.read_tier = kReadAllTier; + read_options.fill_cache = false; + + { + std::array<PinnableSlice, kNumKeys> values; + std::array<Status, kNumKeys> statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } + + read_options.read_tier = kBlockCacheTier; + + { + std::array<PinnableSlice, kNumKeys> values; + std::array<Status, kNumKeys> statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_TRUE(statuses[i].IsIncomplete()); + ASSERT_TRUE(values[i].empty()); + } + } + + read_options.read_tier = kReadAllTier; + read_options.fill_cache = true; + + { + std::array<PinnableSlice, kNumKeys> values; + std::array<Status, kNumKeys> statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } + + read_options.read_tier = kBlockCacheTier; + + { + std::array<PinnableSlice, kNumKeys> values; + std::array<Status, kNumKeys> statuses; + db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + ASSERT_OK(Put(key, blob)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "Version::Get::TamperWithBlobIndex", [](void* arg) { + Slice* const blob_index 
= static_cast<Slice*>(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + + DestroyAndReopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array<std::string, kNumOfKeys> key_strs; + std::array<std::string, kNumOfKeys> value_strs; + std::array<Slice, kNumOfKeys + 1> keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_strs[i] = "foo" + std::to_string(i); + value_strs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_strs[i], value_strs[i])); + keys[i] = key_strs[i]; + } + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + ASSERT_OK(Put(key, blob)); + keys[kNumOfKeys] = key; + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) { + KeyContext* const key_context = static_cast<KeyContext*>(arg); + assert(key_context); + assert(key_context->key); + + if (*(key_context->key) == key) { + Slice* const blob_index = key_context->value; + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array<PinnableSlice, kNumOfKeys + 1> values; + std::array<Status, kNumOfKeys + 1> statuses; + db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/false); + for (size_t i = 0; i < kNumOfKeys + 1; ++i) { + if (i != kNumOfKeys) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("blob_value" + 
std::to_string(i), values[i]); + } else { + ASSERT_TRUE(statuses[i].IsCorruption()); + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array<std::string, kNumOfKeys> key_bufs; + std::array<std::string, kNumOfKeys> value_bufs; + std::array<Slice, kNumOfKeys> keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_bufs[i] = "foo" + std::to_string(i); + value_bufs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_bufs[i], value_bufs[i])); + keys[i] = key_bufs[i]; + } + ASSERT_OK(Flush()); + + std::array<PinnableSlice, kNumOfKeys> values; + std::array<Status, kNumOfKeys> statuses; + ReadOptions read_opts; + read_opts.value_size_soft_limit = 1; + db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/true); + for (const auto& s : statuses) { + ASSERT_TRUE(s.IsAborted()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + // The length of the blob literal (excluding the terminating NUL) has to stay + // below min_blob_size for the value to qualify as inlined. + static_assert(sizeof(blob) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. 
+ std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. + std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr<TraceWriter> trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + 
{ + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr<TraceReader> trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version)); + + // Read records. + int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations. + ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector<std::string> files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + 
Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + // The base Put and each Merge operand are flushed separately, so a read has + // to combine all three pieces. + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + // Verify the merged result both through the DB API and the test helper. + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(value, "v1,v2,v3"); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, Properties) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr size_t key1_size = sizeof(key1) - 1; + + 
constexpr char key2[] = "key2"; + constexpr size_t key2_size = sizeof(key2) - 1; + + constexpr char key3[] = "key3"; + constexpr size_t key3_size = sizeof(key3) - 1; + + constexpr char blob[] = "00000000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + constexpr char longer_blob[] = "00000000000000000000"; + constexpr size_t longer_blob_size = sizeof(longer_blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Put(key2, longer_blob)); + ASSERT_OK(Flush()); + + constexpr size_t first_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + + longer_blob_size + BlobLogFooter::kSize; + + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr size_t second_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size + + BlobLogFooter::kSize; + + constexpr size_t total_expected_size = + first_blob_file_expected_size + second_blob_file_expected_size; + + // Number of blob files + uint64_t num_blob_files = 0; + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files)); + ASSERT_EQ(num_blob_files, 2); + + // Total size of live blob files + uint64_t live_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize, + &live_blob_file_size)); + ASSERT_EQ(live_blob_file_size, total_expected_size); + + // Total amount of garbage in live blob files + { + uint64_t live_blob_file_garbage_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize, + &live_blob_file_garbage_size)); + ASSERT_EQ(live_blob_file_garbage_size, 0); + } + + // Total size of all blob files across all versions + // Note: this should be the same as above since we only have one + // version at this point. 
+ uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, total_expected_size); + + // Delete key2 to create some garbage + ASSERT_OK(Delete(key2)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + constexpr size_t expected_garbage_size = + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + + longer_blob_size; + + constexpr double expected_space_amp = + static_cast<double>(total_expected_size) / + (total_expected_size - expected_garbage_size); + + // Blob file stats + std::string blob_stats; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats)); + + std::ostringstream oss; + oss << "Number of blob files: 2\nTotal size of blob files: " + << total_expected_size + << "\nTotal size of garbage in blob files: " << expected_garbage_size + << "\nBlob file space amplification: " << expected_space_amp << '\n'; + + ASSERT_EQ(blob_stats, oss.str()); + + // Total amount of garbage in live blob files + { + uint64_t live_blob_file_garbage_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize, + &live_blob_file_garbage_size)); + ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size); + } +} + +TEST_F(DBBlobBasicTest, PropertiesMultiVersion) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr char key2[] = "key2"; + constexpr char key3[] = "key3"; + + constexpr size_t key_size = sizeof(key1) - 1; + static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2"); + static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3"); + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + 
ASSERT_OK(Flush()); + + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + // Create an iterator to keep the current version alive + std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); + + // Note: the Delete and subsequent compaction results in the first blob file + // not making it to the final version. (It is still part of the previous + // version kept alive by the iterator though.) On the other hand, the Put + // results in a third blob file. + ASSERT_OK(Delete(key1)); + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Total size of all blob files across all versions: between the two versions, + // we should have three blob files of the same size with one blob each. + // The version kept alive by the iterator contains the first and the second + // blob file, while the final version contains the second and the third blob + // file. (The second blob file is thus shared by the two versions but should + // be counted only once.) 
+ uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, + 3 * (BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size + BlobLogFooter::kSize)); +} +#endif // !ROCKSDB_LITE + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface<std::string> { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_; + std::string sync_point_; +}; + +class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest { + public: + DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest, + ::testing::ValuesIn(std::vector<std::string>{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::MultiGetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + 
SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{first_key, second_key}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char key1[] = "key1"; + constexpr char value1[] = "blob1"; + + ASSERT_OK(Put(key1, value1)); + ASSERT_OK(Flush()); + + constexpr char key2[] = "key2"; + constexpr char value2[] = "blob2"; + + ASSERT_OK(Put(key2, value2)); + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{key1, key2}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> 
statuses; + + bool first_blob_file = true; + SyncPoint::GetInstance()->SetCallBack( + sync_point_, [&first_blob_file, this](void* /* arg */) { + if (first_blob_file) { + first_blob_file = false; + return; + } + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(statuses[0]); + ASSERT_EQ(value1, values[0]); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + 
SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 1 << 25; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + DestroyAndReopen(options); + + constexpr size_t kNumBlobs = 10; + constexpr size_t kValueSize = 100; + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= kNumBlobs; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(i * 2, 
options.statistics->getTickerCount(BLOB_DB_CACHE_HIT)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlobs * 2, + options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) { + Options options = GetDefaultOptions(); + + LRUCacheOptions co; + co.capacity = 1 << 25; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto backing_cache = NewLRUCache(co); + + options.blob_cache = backing_cache; + + BlockBasedTableOptions block_based_options; + block_based_options.no_block_cache = false; + block_based_options.block_cache = backing_cache; + block_based_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(block_based_options)); + + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + DestroyAndReopen(options); + + constexpr size_t kNumBlobs = 10; + constexpr size_t kValueSize = 100; + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= 5; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT)); + } + 
+ ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}})); + + for (size_t i = 6; i <= kNumBlobs; i++) { + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap + ASSERT_OK(Flush()); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs))); + ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD)); + ASSERT_EQ(2, + options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) { + CompressedSecondaryCacheOptions secondary_cache_opts; + secondary_cache_opts.capacity = 1 << 20; + secondary_cache_opts.num_shard_bits = 0; + secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata; + secondary_cache_opts.compression_type = kNoCompression; + + LRUCacheOptions primary_cache_opts; + primary_cache_opts.capacity = 1024; + primary_cache_opts.num_shard_bits = 0; + primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata; + primary_cache_opts.secondary_cache = + NewCompressedSecondaryCache(secondary_cache_opts); + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.statistics = CreateDBStatistics(); + options.enable_blob_files = true; + options.blob_cache = NewLRUCache(primary_cache_opts); + options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly; + + DestroyAndReopen(options); + + // Note: only one of the two blobs fit in the primary cache at any given time. 
+ constexpr char first_key[] = "foo"; + constexpr size_t first_blob_size = 512; + const std::string first_blob(first_blob_size, 'a'); + + constexpr char second_key[] = "bar"; + constexpr size_t second_blob_size = 768; + const std::string second_blob(second_blob_size, 'b'); + + // First blob is inserted into primary cache during flush. + ASSERT_OK(Put(first_key, first_blob)); + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1); + + // Second blob is inserted into primary cache during flush, + // First blob is evicted but only a dummy handle is inserted into secondary + // cache. + ASSERT_OK(Put(second_key, second_blob)); + ASSERT_OK(Flush()); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1); + + // First blob is inserted into primary cache. + // Second blob is evicted but only a dummy handle is inserted into secondary + // cache. + ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 0); + // Second blob is inserted into primary cache, + // First blob is evicted and is inserted into secondary cache. + ASSERT_EQ(Get(second_key), second_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 0); + + // First blob's dummy item is inserted into primary cache b/c of lookup. + // Second blob is still in primary cache. 
+ ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 1); + + // First blob's item is inserted into primary cache b/c of lookup. + // Second blob is evicted and inserted into secondary cache. + ASSERT_EQ(Get(first_key), first_blob); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0); + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1); + ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS), + 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc new file mode 100644 index 000000000..f3fe3c03b --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc @@ -0,0 +1,913 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector<InternalStats::CompactionStats>& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class FilterByValueLength : public CompactionFilter { + public: + explicit FilterByValueLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.value.length"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (existing_value.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + 
return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + 
public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; + +class SkipUntilFilter : public CompactionFilter { + public: + explicit SkipUntilFilter(std::string skip_until) + : skip_until_(std::move(skip_until)) {} + + const char* Name() const override { + return "rocksdb.compaction.filter.skip.until"; + } + + CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */, + ValueType /* value_type */, + const Slice& /* existing_value */, + std::string* /* new_value */, + std::string* skip_until) const override { + assert(skip_until); + *skip_until = skip_until_; + + return 
CompactionFilter::Decision::kRemoveAndSkipUntil; + } + + private: + std::string skip_until_; +}; + +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple<std::string, CompactionFilter::Decision, + CompactionFilter::Decision>> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr<CompactionFilter> compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + 
ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, FilterByValueLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 5; + options.create_if_missing = true; + constexpr size_t kValueLength = 5; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new FilterByValueLength(kValueLength)); + options.compaction_filter = compaction_filter_guard.get(); + + const std::vector<std::string> short_value_keys = {"a", "e", "j"}; + constexpr char short_value[] = "val"; + const std::vector<std::string> long_value_keys = {"b", "f", "k"}; + constexpr char long_value[] = "valuevalue"; + + DestroyAndReopen(options); + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_OK(Put(short_value_keys[i], short_value)); + } + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_OK(Put(long_value_keys[i], long_value)); + } + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + for (size_t i = 0; i < short_value_keys.size(); ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound()); + value.clear(); + } + for (size_t i = 0; i < long_value_keys.size(); ++i) { + ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value)); + ASSERT_EQ(long_value, value); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove based on value; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); 
+#endif // ROCKSDB_LITE + + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) { + Options options = GetDefaultOptions(); + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.blob_file_starting_level = 5; + options.create_if_missing = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut + // new table file when encountering a new key whose 1-byte prefix changes. + constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + ASSERT_OK(TryReopen(options)); + + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1)); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // No blob file should be created since blob_file_starting_level is 5. 
+ ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + options.blob_file_starting_level = 1; + DestroyAndReopen(options); + + ASSERT_OK(Put(first_key, first_blob)); + ASSERT_OK(Put(second_key, second_blob)); + ASSERT_OK(Put(third_key, third_blob)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(fourth_key, fourth_blob)); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, GetBlobFileNumbers().size()); + ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1)); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + // The compaction's output level equals to blob_file_starting_level. + ASSERT_EQ(1, GetBlobFileNumbers().size()); + ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0)); + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + } + + Close(); +} +#endif + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::string> keys = {"a", "b", "c"}; + const std::vector<std::string> values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in 
FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, SkipUntilFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new SkipUntilFilter("z")); + options.compaction_filter = compaction_filter_guard.get(); + + Reopen(options); + + const std::vector<std::string> keys{"a", "b", "c"}; + const std::vector<std::string> values{"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + + ASSERT_OK(Flush()); + + int process_in_flow_called = 0; + + SyncPoint::GetInstance()->SetCallBack( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow", + [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (const auto& key : keys) { + ASSERT_EQ(Get(key), "NOT_FOUND"); + } + + // Make sure SkipUntil was performed using iteration rather than Seek + ASSERT_EQ(process_in_flow_called, keys.size()); + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + 
DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. + std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::pair<std::string, std::string>> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), 
Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + ASSERT_OK(Put(key, blob)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex", + [](void* arg) { + Slice* const blob_index = static_cast<Slice*>(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector<uint64_t> blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, TrackGarbage) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + Reopen(options); + + // First table+blob file pair: 4 blobs with different keys + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + // Second table+blob file pair: overwrite 2 existing keys + constexpr char new_first_value[] = "new_first_value"; + constexpr char new_second_value[] = "new_second_value"; + + ASSERT_OK(Put(first_key, new_first_value)); + ASSERT_OK(Put(second_key, new_second_value)); + ASSERT_OK(Flush()); + + // Compact them together. The first blob file should have 2 garbage blobs + // corresponding to the 2 overwritten keys. 
+ constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 2); + + { + const auto& meta = blob_files.front(); + assert(meta); + + constexpr uint64_t first_expected_bytes = + sizeof(first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t second_expected_bytes = + sizeof(second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + constexpr uint64_t third_expected_bytes = + sizeof(third_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) - + 1); + constexpr uint64_t fourth_expected_bytes = + sizeof(fourth_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 4); + ASSERT_EQ(meta->GetTotalBlobBytes(), + first_expected_bytes + second_expected_bytes + + third_expected_bytes + fourth_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 2); + ASSERT_EQ(meta->GetGarbageBlobBytes(), + first_expected_bytes + second_expected_bytes); + } + + { + const auto& meta = blob_files.back(); + assert(meta); + + constexpr uint64_t new_first_expected_bytes = + sizeof(new_first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t new_second_expected_bytes = + sizeof(new_second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + + 
ASSERT_EQ(meta->GetTotalBlobCount(), 2); + ASSERT_EQ(meta->GetTotalBlobBytes(), + new_first_expected_bytes + new_second_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta->GetGarbageBlobBytes(), 0); + } +} + +TEST_F(DBBlobCompactionTest, MergeBlobWithBase) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + ASSERT_OK(Put("Key1", "v1_1")); + ASSERT_OK(Put("Key2", "v2_1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_2")); + ASSERT_OK(Merge("Key2", "v2_2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_3")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3"); + ASSERT_EQ(Get("Key2"), "v2_1,v2_2"); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "pie"); + ASSERT_EQ(Get("foo"), "baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) { + Options options = GetDefaultOptions(); + + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("pie")); + + options.compaction_filter = compaction_filter_guard.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "limepie"); + ASSERT_EQ(Get("foo"), "barpie"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("key", "pie")); + ASSERT_OK(Merge("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + 
[&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "lime,pie"); + ASSERT_EQ(Get("foo"), "bar,baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) { + Options options = GetDefaultOptions(); + + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + LRUCacheOptions cache_options; + cache_options.capacity = 1 << 20; + cache_options.metadata_charge_policy = kDontChargeCacheMetadata; + + options.blob_cache = NewLRUCache(cache_options); + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0); + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc new file mode 100644 index 000000000..7ac7ce3fc --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector<std::string> filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast<Status*>(arg); + ASSERT_NE(s, nullptr); + 
++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc new file mode 100644 index 000000000..64c550894 --- /dev/null +++ b/src/rocksdb/db/blob/db_blob_index_test.cc @@ -0,0 +1,602 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <functional> +#include <string> +#include <utility> +#include <vector> + +#include "db/arena_wrapped_db_iter.h" +#include "db/blob/blob_index.h" +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb +// should accept the value type on write, and report not supported value +// for reads, unless caller request for it explicitly. 
The base rocksdb +// doesn't understand format of actual blob index (the value). +class DBBlobIndexTest : public DBTestBase { + public: + enum Tier { + kMemtable = 0, + kImmutableMemtables = 1, + kL0SstFile = 2, + kLnSstFile = 3, + }; + const std::vector<Tier> kAllTiers = {Tier::kMemtable, + Tier::kImmutableMemtables, + Tier::kL0SstFile, Tier::kLnSstFile}; + + DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {} + + ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } + + ColumnFamilyData* cfd() { + return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd(); + } + + Status PutBlobIndex(WriteBatch* batch, const Slice& key, + const Slice& blob_index) { + return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, + blob_index); + } + + Status Write(WriteBatch* batch) { + return dbfull()->Write(WriteOptions(), batch); + } + + std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, + const Snapshot* snapshot = nullptr) { + ReadOptions read_options; + read_options.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); + if (s.IsNotFound()) { + return "NOT_FOUND"; + } + if (s.IsCorruption()) { + return "CORRUPTION"; + } + if (s.IsNotSupported()) { + return "NOT_SUPPORTED"; + } + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } + + std::string GetBlobIndex(const Slice& key, + const Snapshot* snapshot = nullptr) { + bool is_blob_index = false; + std::string value = GetImpl(key, &is_blob_index, snapshot); + if (!is_blob_index) { + return "NOT_BLOB"; + } + return value; + } + + ArenaWrappedDBIter* GetBlobIterator() { + return dbfull()->NewIteratorImpl( + ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), + nullptr /*read_callback*/, true 
/*expose_blob_index*/); + } + + Options GetTestOptions() { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.num_levels = 2; + options.disable_auto_compactions = true; + // Disable auto flushes. + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + return options; + } + + void MoveDataTo(Tier tier) { + switch (tier) { + case Tier::kMemtable: + break; + case Tier::kImmutableMemtables: + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + break; + case Tier::kL0SstFile: + ASSERT_OK(Flush()); + break; + case Tier::kLnSstFile: + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "dummy")); + ASSERT_OK(Put("z", "dummy")); + ASSERT_OK(Flush()); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + break; + } + } +}; + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. We should be able to write kTypeBlobIndex to memtables and +// SST files. 
+TEST_F(DBBlobIndexTest, Write) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + + std::vector<std::pair<std::string, std::string>> key_values; + + constexpr size_t num_key_values = 5; + + key_values.reserve(num_key_values); + + for (size_t i = 1; i <= num_key_values; ++i) { + std::string key = "key" + std::to_string(i); + + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, + "blob" + std::to_string(i)); + + key_values.emplace_back(std::move(key), std::move(blob_index)); + } + + for (const auto& key_value : key_values) { + WriteBatch batch; + ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second)); + ASSERT_OK(Write(&batch)); + } + + MoveDataTo(tier); + + for (const auto& key_value : key_values) { + ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second); + } + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should be able to return blob index if is_blob_index is +// provided, otherwise it should return Status::NotSupported (when reading from +// memtable) or Status::Corruption (when reading from SST). Reading from SST +// returns Corruption because we can't differentiate between the application +// accidentally opening the base DB of a stacked BlobDB and actual corruption +// when using the integrated BlobDB. 
+TEST_F(DBBlobIndexTest, Get) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob"); + + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + + WriteBatch batch; + ASSERT_OK(batch.Put("key", "value")); + ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index)); + ASSERT_OK(Write(&batch)); + + MoveDataTo(tier); + + // Verify normal value + bool is_blob_index = false; + PinnableSlice value; + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("value", GetImpl("key")); + ASSERT_EQ("value", GetImpl("key", &is_blob_index)); + ASSERT_FALSE(is_blob_index); + + // Verify blob index + if (tier <= kImmutableMemtables) { + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + } else { + ASSERT_TRUE(Get("blob_key", &value).IsCorruption()); + ASSERT_EQ("CORRUPTION", GetImpl("blob_key")); + } + ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index)); + ASSERT_TRUE(is_blob_index); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should NOT return Status::NotSupported/Status::Corruption +// if blob index is updated with a normal value. See the test case above for +// more details. +TEST_F(DBBlobIndexTest, Updated) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob"); + + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + for (int i = 0; i < 10; i++) { + ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index)); + } + ASSERT_OK(Write(&batch)); + // Avoid blob values from being purged. 
+ const Snapshot* snapshot = dbfull()->GetSnapshot(); + ASSERT_OK(Put("key1", "new_value")); + ASSERT_OK(Merge("key2", "a")); + ASSERT_OK(Merge("key2", "b")); + ASSERT_OK(Merge("key2", "c")); + ASSERT_OK(Delete("key3")); + ASSERT_OK(SingleDelete("key4")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Merge("key5", "a")); + ASSERT_OK(Merge("key5", "b")); + ASSERT_OK(Merge("key5", "c")); + ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); + MoveDataTo(tier); + for (int i = 0; i < 10; i++) { + ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot)); + } + ASSERT_EQ("new_value", Get("key1")); + if (tier <= kImmutableMemtables) { + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + } else { + ASSERT_EQ("CORRUPTION", GetImpl("key2")); + } + ASSERT_EQ("NOT_FOUND", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("a,b,c", GetImpl("key5")); + for (int i = 6; i < 9; i++) { + ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i))); + } + ASSERT_EQ(blob_index, GetBlobIndex("key9")); + dbfull()->ReleaseSnapshot(snapshot); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. When a blob iterator is used, it should set the +// expose_blob_index flag for the underlying DBIter, and retrieve/return the +// corresponding blob value. If a regular DBIter is created (i.e. +// expose_blob_index is not set), it should return Status::Corruption. 
+TEST_F(DBBlobIndexTest, Iterate) { + const std::vector<std::vector<ValueType>> data = { + /*00*/ {kTypeValue}, + /*01*/ {kTypeBlobIndex}, + /*02*/ {kTypeValue}, + /*03*/ {kTypeBlobIndex, kTypeValue}, + /*04*/ {kTypeValue}, + /*05*/ {kTypeValue, kTypeBlobIndex}, + /*06*/ {kTypeValue}, + /*07*/ {kTypeDeletion, kTypeBlobIndex}, + /*08*/ {kTypeValue}, + /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, + /*10*/ {kTypeValue}, + /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, + /*12*/ {kTypeValue}, + /*13*/ + {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, + /*14*/ {kTypeValue}, + /*15*/ {kTypeBlobIndex}, + /*16*/ {kTypeValue}, + }; + + auto get_key = [](int index) { + char buf[20]; + snprintf(buf, sizeof(buf), "%02d", index); + return "key" + std::string(buf); + }; + + auto get_value = [&](int index, int version) { + return get_key(index) + "_value" + std::to_string(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status().code()); + if (expected_status == Status::kOk) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto create_normal_iterator = [&]() -> Iterator* { + return dbfull()->NewIterator(ReadOptions()); + }; + + auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; + + auto check_is_blob = [&](bool is_blob) { + return [is_blob](Iterator* iterator) { + ASSERT_EQ(is_blob, + reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob()); + }; + }; + + auto verify = [&](int index, Status::Code expected_status, + const Slice& forward_value, const Slice& backward_value, + std::function<Iterator*()> create_iterator, + std::function<void(Iterator*)> extra_check = nullptr) { + // Seek + auto* iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + 
iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Next + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // SeekForPrev + iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Prev + iterator = create_iterator(); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + }; + + for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { + // Avoid values from being purged. 
+ std::vector<const Snapshot*> snapshots; + DestroyAndReopen(GetTestOptions()); + + // fill data + for (int i = 0; i < static_cast<int>(data.size()); i++) { + for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) { + std::string key = get_key(i); + std::string value = get_value(i, j); + WriteBatch batch; + switch (data[i][j]) { + case kTypeValue: + ASSERT_OK(Put(key, value)); + break; + case kTypeDeletion: + ASSERT_OK(Delete(key)); + break; + case kTypeSingleDeletion: + ASSERT_OK(SingleDelete(key)); + break; + case kTypeMerge: + ASSERT_OK(Merge(key, value)); + break; + case kTypeBlobIndex: + ASSERT_OK(PutBlobIndex(&batch, key, value)); + ASSERT_OK(Write(&batch)); + break; + default: + FAIL(); + }; + } + snapshots.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK( + dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); + snapshots.push_back(dbfull()->GetSnapshot()); + MoveDataTo(tier); + + // Normal iterator + verify(1, Status::kCorruption, "", "", create_normal_iterator); + verify(3, Status::kCorruption, "", "", create_normal_iterator); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_normal_iterator); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_normal_iterator); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_normal_iterator); + verify(11, Status::kCorruption, "", "", create_normal_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_normal_iterator); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_normal_iterator); + + // Iterator with blob support + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + 
create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); + +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. + ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + + for 
(auto* snapshot : snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } + } +} + +TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { + const std::vector<std::vector<std::string>> data = { + /*00*/ {"Put"}, + /*01*/ {"Put", "Merge", "Merge", "Merge"}, + /*02*/ {"Put"}}; + + auto get_key = [](size_t index) { return ("key" + std::to_string(index)); }; + + auto get_value = [&](size_t index, size_t version) { + return get_key(index) + "_value" + std::to_string(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status()); + if (expected_status.ok()) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto verify = [&](size_t index, Status expected_status, + const Slice& expected_value) { + // Seek + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr<Iterator> iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Next + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr<Iterator> iterator_guard(iterator); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, expected_value); + } + // SeekForPrev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr<Iterator> iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Prev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr<Iterator> iterator_guard(iterator); + iterator->Seek(get_key(index + 1)); + 
ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, expected_value); + } + }; + + Options options = GetTestOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + DestroyAndReopen(options); + + // fill data + for (size_t i = 0; i < data.size(); i++) { + for (size_t j = 0; j < data[i].size(); j++) { + std::string key = get_key(i); + std::string value = get_value(i, j); + if (data[i][j] == "Put") { + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + } else if (data[i][j] == "Merge") { + ASSERT_OK(Merge(key, value)); + ASSERT_OK(Flush()); + } + } + } + + std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," + + get_value(1, 2) + "," + get_value(1, 3); + Status expected_status; + verify(1, expected_status, expected_value); + +#ifndef ROCKSDB_LITE + // Test DBIter::FindValueForCurrentKeyUsingSeek flow. + ASSERT_OK(dbfull()->SetOptions(cfh(), + {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, expected_status, expected_value); +#endif // !ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc new file mode 100644 index 000000000..079576f51 --- /dev/null +++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/prefetch_buffer_collection.h" + +namespace ROCKSDB_NAMESPACE { + +FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer( + uint64_t file_number) { + auto& prefetch_buffer = prefetch_buffers_[file_number]; + if (!prefetch_buffer) { + prefetch_buffer.reset( + new FilePrefetchBuffer(readahead_size_, readahead_size_)); + } + + return prefetch_buffer.get(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h new file mode 100644 index 000000000..b973eddc0 --- /dev/null +++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include <cassert> +#include <cstdint> +#include <memory> +#include <unordered_map> + +#include "file/file_prefetch_buffer.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that owns a collection of FilePrefetchBuffers using the file number +// as key. Used for implementing compaction readahead for blob files. Designed +// to be accessed by a single thread only: every (sub)compaction needs its own +// buffers since they are guaranteed to read different blobs from different +// positions even when reading the same file. +class PrefetchBufferCollection { + public: + explicit PrefetchBufferCollection(uint64_t readahead_size) + : readahead_size_(readahead_size) { + assert(readahead_size_ > 0); + } + + FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number); + + private: + uint64_t readahead_size_; + std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>> + prefetch_buffers_; // maps file number to prefetch buffer +}; + +} // namespace ROCKSDB_NAMESPACE |